diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..5e97f52 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length=120 +ignore = E203,E305,E402,E721,E741,F401,F403,F405,F821,F841,F999,W503,W504 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d826437 --- /dev/null +++ b/.gitignore @@ -0,0 +1,145 @@ +# Data files +.data + +# Results folders +run_kaggle_pt/ +results/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..8b32f06 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "param"] + path = param + url = https://github.com/facebookresearch/param.git +[submodule "benchmarks/rnnt/ootb/inference/third_party/pybind"] + path = benchmarks/rnnt/ootb/inference/third_party/pybind + url = https://github.com/pybind/pybind11.git diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..08b500a --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,80 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. 
+ +This Code of Conduct also applies outside the project spaces when there is a +reasonable belief that an individual's behavior may have a negative impact on +the project or its community. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..1665402 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing to proxyworkloads +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `main`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## License +By contributing to proxyworkloads, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bb84d29 --- /dev/null +++ b/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2021 Meta Platforms, Inc. and its affiliates. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..2bb9ad2 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,176 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f2e38c0 --- /dev/null +++ b/README.md @@ -0,0 +1,131 @@ +# Proxy Workloads + +These benchmarks represent important workloads. The faster these benchmarks are, the happier owners of important workloads are. The maintainers, updates, and rules in this benchmark suite all exist to preserve the connection between the people running these benchmarks and the people running the original workloads. + +The key things to know: +- These benchmarks are directly connected to real workloads run every day +- The main metric is throughput, subject to some constraints such as latency or max batch size +- Data is often synthetic, though we have safeguards to ensure correctness +- There are special requirements when improving these benchmarks - it's not "anything goes" +- This includes benchmarks (runnable on 1 device, multiple devices, clusters) and microbenchmarks + + +To get started running the benchmark suite right away on a V100: + + cd proxyworkloads/benchmarks + ./run_all.sh + + +## The Suite + +This suite captures benchmarks across multiple devices, across multiple precisions, and includes microbenchmarks.
We organize the suite so each benchmark result is identified as: + + Benchmark = Models + Implementation + Mode + Configuration + +### Models +This suite contains the following benchmarks: +- Recommendation: DLRM +- Text: XLM-R (WIP) +- Vision: CVT (Planned) +- Text: OSCAR (Planned) +- Speech: RNN-T (WIP) +- Video: Resnext-3D (Planned) +- Image: Regnet-Y (Planned) + +### Implementation + +Each benchmark comes in three different implementations: +- Out Of The Box (OOTB): indicates the performance that is provided by the libraries and frameworks. Code is written the way a regular AI engineer or researcher would write it, not the way a systems/hardware specialist would. +- Optimized: represents the best possible performance which can be reached; the code is tuned, re-written (and perhaps even mangled) by hardware and software experts. +- Microbenchmarks: benchmarks which look at a specific component of a device, computer, or cluster. These are highly unique and specialized in their purpose. + +### Modes + +For OOTB and optimized implementations, the modes are Inference and Training. For Microbenchmarks, the mode is the specific kind of microbenchmark being run. + +### Configurations + +Each implementation comes in multiple configurations. Each configuration looks at the benchmark in a different way, such as: +- The model and data scaled to different numbers of devices: e.g. 1, 8, multiple nodes +- Different precisions and numeric formats +- Different variants of the models, representing different layer counts or sizes at which the model might be run. + +## Results + +Running one or more benchmarks on a specific machine or cluster produces a results table. Below are example results which you may get. + +|Model |Implementation|Mode |Config |Batch Size|Score |Units| |-------------------------|--------------|------------|-------------------|----------|------|-----| |Recommend: DLRM |OOTB |Training |A.1dev-embed32-fp32|1024 |570.16|ex/s | |Recommend: DLRM |OOTB |Inference |A.1dev-embed4-fp32 |1024 |61.85*|ex/s | |Recommend: DLRM |Micro |MLP/Linear |linear_A.1dev |256 |7.08 |TF/s | |Recommend: DLRM |Micro |EmbeddingBag|emb_A.1dev |65536 |537.80|GB/s | +\* = missed latency target + +Notice the following in this table: +- Each row is one Benchmark run with a batch size (`Model + Implementation + Mode + Config` at a given batch size). More on batch size in Suite Design. +- All rows in the same table are run on the same machine. Benchmarks from different hardware must appear in different result tables. +- Some results have a `*` denoting that they missed the latency target. More on latency targets in Suite Design. +- You may report multiple batch sizes for the same benchmark; they appear as different lines in the table. + + +### Results by System Scale +We look at all the results to understand the broader picture of performance. + +**For systems that can't run the full model:** Microbenchmarks give us a picture of potential performance and early indicators of where to explore more. + +**For single-device systems:** For training, single-device configurations and microbenchmarks can indicate trends in overall cluster performance; microbenchmarks run on the cluster, paired with single-device results, can indicate whether single-device performance is in fact the bottleneck. For inference, since a single inference is often easily parallelizable across multiple devices, the single-device benchmarks are a very good indicator of real performance.
This has the added advantage of being quick and easy for debugging and experiments. + +**For multiple devices, single node:** For training, multi-device configurations give good insight into how single nodes perform within a cluster - this can be combined with microbenchmarks on the cluster to predict overall performance. For inference, this is a great reflection of actual workloads. This has the added advantage of being quick and easy for debugging and experiments. + +**For clusters:** Running these benchmarks on a cluster gives the best indication of performance for training but does not add additional information for inference. The downside is, obviously, that these runs are more costly to set up and run. + + +### How Results are Consumed +There are two broad comparisons that can be done: system-to-system and OOTB vs. Optimized. + +- System to system: Compare two tables generated by two different systems to understand their differences. +- OOTB vs. Optimized: Look at one table, one system, and understand the gap between the software (compilers, frameworks, and libraries) and what might be possible if the software were improved. + +Generally, consuming results is specific to the situation. Different goals lead to different priorities and weights when evaluating results, so there isn't a one-size-fits-all approach here. It's up to the people and the situation. + + +## Suite Design +We are very specific about how these benchmarks must be run and optimized in order to maintain our goal: **improvements to these benchmarks connect directly to improvements in important internal workloads**. Where our methodology may seem arbitrary or cumbersome, it is in service of maintaining the connection to the source. + +### Ownership, Versions & Updates +Each Benchmark (`Model + Implementation + Mode + Config`) is connected with an actual owner of an actual workload who endorsed the benchmark. The owner is the arbiter of changes, updates, and methodology for the benchmark. It is exceptionally frustrating to see benchmarks change while you are working on them. It sucks, so we version our benchmarks to help with bookkeeping. Ultimately, our goal here is to reflect the current state of what people care about - unfortunately this means (sometimes too frequently) bumping versions to ensure we are offering the best proxy to the world. + +### Convergence and Accuracy +The gold standard for understanding how the system works is measuring convergence and accuracy of the model in the end-to-end context. Unfortunately, as shown by MLPerf, this is exceptionally costly, burdensome, and slow. We do not place an emphasis on convergence and accuracy for the following reasons: +- We don't allow significant changes to model code (see "Improving the Benchmark Score"), so we don't expect people to break convergence +- We limit the data types and precisions to ones we understand and know to be viable +- We (will) offer the ability to verify correctness (possibly through real data or through statistical analysis on synthetic data) +- We lean on MLPerf, which has a similar suite of models and requires submissions to test correctness. + +Overall, we aim to allow benchmarking at a granularity which is usable by people in their projects, representative of the actual workloads, and not overly cumbersome or expensive. It's a compromise. + +### Data +As discussed in Convergence and Accuracy, we are not an accuracy or convergence benchmark.
This frees us up to use synthetic data, which significantly improves usability and time-to-results for this suite. + +We may choose to use real data, or data derived from real data, where we cannot generate proper synthetic data. + +### Batch Sizes +Generally speaking, the bigger the batch size, the better the throughput, but the longer the time to converge and the higher the latency. When running these benchmarks, people will want to see: +- The benchmark run at specific known batch sizes (where the convergence is understood) to allow for predicting and modeling +- The benchmark run at the batch size which gives the best throughput, subject to either (a) a maximum batch size for which the model will converge, or (b) a latency requirement for requests. + +### Latency Limits +Inference benchmarks come with latency limits, and the goal is to provide the best QPS while hitting the latency limit. Some inference benchmarks may reflect user-facing operations where latency is key. Some inference benchmarks may reflect background jobs where throughput is key - so the latency limit is very high in these cases. + +## Improving the Benchmark Score +The bigger the score, the better - but there are limits on how to get there. The limits depend on the implementation (Out-Of-The-Box (OOTB), Optimized, or Microbenchmark). + +- Out-Of-The-Box (OOTB): Improvements must come in through libraries, frameworks, and new hardware. No changing the model code (special exceptions for non-optimizing changes which enable porting to new hardware). +- Optimized: No holds barred - make the system shine. Just keep in mind that for everything you do, you're asking the actual people who run the workloads to do it too if they're going to realize that performance. You'll need to describe what changes you made, so keep track. +- Microbenchmarks: Implement the same operation as defined, and make it as fast as possible. + +## License + +This is released under the Apache 2.0 license. Please see the [`LICENSE`](LICENSE) file for more information. + diff --git a/benchmarks/dlrm/ootb/CODE_OF_CONDUCT.md b/benchmarks/dlrm/ootb/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..0f7ad8b --- /dev/null +++ b/benchmarks/dlrm/ootb/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Code of Conduct + +Facebook has adopted a Code of Conduct that we expect project participants to adhere to. +Please read the [full text](https://code.fb.com/codeofconduct/) +so that you can understand what actions will and will not be tolerated. diff --git a/benchmarks/dlrm/ootb/CONTRIBUTING.md b/benchmarks/dlrm/ootb/CONTRIBUTING.md new file mode 100644 index 0000000..cc013a1 --- /dev/null +++ b/benchmarks/dlrm/ootb/CONTRIBUTING.md @@ -0,0 +1,36 @@ +# Contributing to DLRM +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs.
Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## Coding Style +* 4 spaces for indentation rather than tabs +* 80 character line length +* in general, please maintain a consistent style with the rest of the code + +## License +By contributing to DLRM, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. diff --git a/benchmarks/dlrm/ootb/Dockerfile b/benchmarks/dlrm/ootb/Dockerfile new file mode 100644 index 0000000..0e4b750 --- /dev/null +++ b/benchmarks/dlrm/ootb/Dockerfile @@ -0,0 +1,15 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +ARG FROM_IMAGE_NAME=pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime +FROM ${FROM_IMAGE_NAME} + +ADD requirements.txt . +RUN pip install -r requirements.txt + +RUN pip install torch==1.3.1 + +WORKDIR /code +ADD . . diff --git a/benchmarks/dlrm/ootb/README.md b/benchmarks/dlrm/ootb/README.md new file mode 100644 index 0000000..7096b83 --- /dev/null +++ b/benchmarks/dlrm/ootb/README.md @@ -0,0 +1,389 @@ +Deep Learning Recommendation Model for Personalization and Recommendation Systems: +================================================================================= +*Copyright (c) Facebook, Inc. and its affiliates.* + +Description: +------------ +An implementation of a deep learning recommendation model (DLRM) +The model input consists of dense and sparse features. The former is a vector +of floating point values. The latter is a list of sparse indices into +embedding tables, which consist of vectors of floating point values. +The selected vectors are passed to mlp networks denoted by triangles, +in some cases the vectors are interacted through operators (Ops). +``` +output: + probability of a click +model: | + /\ + /__\ + | + _____________________> Op <___________________ + / | \ + /\ /\ /\ + /__\ /__\ ... /__\ + | | | + | Op Op + | ____/__\_____ ____/__\____ + | |_Emb_|____|__| ... |_Emb_|__|___| +input: +[ dense features ] [sparse indices] , ..., [sparse indices] +``` + More precise definition of model layers: + 1) fully connected layers of an mlp + + z = f(y) + + y = Wx + b + + 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) + + z = Op(e1,...,ek) + + obtain vectors e1=E[:,p1], ..., ek=E[:,pk] + + 3) Operator Op can be one of the following + + Sum(e1,...,ek) = e1 + ... + ek + + Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] + + Cat(e1,...,ek) = [e1', ..., ek']' + + where ' denotes transpose operation + +Cite [Work](https://arxiv.org/abs/1906.00091): +``` +@article{DLRM19, + author = {Maxim Naumov and Dheevatsa Mudigere and Hao{-}Jun Michael Shi and Jianyu Huang and Narayanan Sundaraman and Jongsoo Park and Xiaodong Wang and Udit Gupta and Carole{-}Jean Wu and Alisson G. 
Azzolini and Dmytro Dzhulgakov and Andrey Mallevich and Ilia Cherniavskii and Yinghai Lu and Raghuraman Krishnamoorthi and Ansha Yu and Volodymyr Kondratenko and Stephanie Pereira and Xianjie Chen and Wenlin Chen and Vijay Rao and Bill Jia and Liang Xiong and Misha Smelyanskiy}, + title = {Deep Learning Recommendation Model for Personalization and Recommendation Systems}, + journal = {CoRR}, + volume = {abs/1906.00091}, + year = {2019}, + url = {https://arxiv.org/abs/1906.00091}, +} +``` + +Related Work: + +On the [system architecture implications](https://arxiv.org/abs/1906.03109), with DLRM as one of the benchmarks, +``` +@article{ArchImpl19, + author = {Udit Gupta and Xiaodong Wang and Maxim Naumov and Carole{-}Jean Wu and Brandon Reagen and David Brooks and Bradford Cottel and Kim M. Hazelwood and Bill Jia and Hsien{-}Hsin S. Lee and Andrey Malevich and Dheevatsa Mudigere and Mikhail Smelyanskiy and Liang Xiong and Xuan Zhang}, + title = {The Architectural Implications of Facebook's DNN-based Personalized Recommendation}, + journal = {CoRR}, + volume = {abs/1906.03109}, + year = {2019}, + url = {https://arxiv.org/abs/1906.03109}, +} +``` + +On the [embedding compression techniques (for number of vectors)](https://arxiv.org/abs/1909.02107), with DLRM as one of the benchmarks, +``` +@article{QuoRemTrick19, + author = {Hao{-}Jun Michael Shi and Dheevatsa Mudigere and Maxim Naumov and Jiyan Yang}, + title = {Compositional Embeddings Using Complementary Partitions for Memory-Efficient Recommendation Systems}, + journal = {CoRR}, + volume = {abs/1909.02107}, + year = {2019}, + url = {https://arxiv.org/abs/1909.02107}, +} +``` + +On the [embedding compression techniques (for dimension of vectors)](https://arxiv.org/abs/1909.11810), with DLRM as one of the benchmarks, +``` +@article{MixDimTrick19, + author = {Antonio Ginart and Maxim Naumov and Dheevatsa Mudigere and Jiyan Yang and James Zou}, + title = {Mixed Dimension Embeddings with Application to Memory-Efficient Recommendation Systems}, + journal = {CoRR}, + volume = {abs/1909.11810}, + year = {2019}, + url = {https://arxiv.org/abs/1909.11810}, +} +``` + +Implementation +-------------- +**DLRM PyTorch**. Implementation of DLRM in PyTorch framework: + + dlrm_s_pytorch.py + +**DLRM Caffe2**. Implementation of DLRM in Caffe2 framework: + + dlrm_s_caffe2.py + +**DLRM Data**. Implementation of DLRM data generation and loading: + + dlrm_data_pytorch.py, dlrm_data_caffe2.py, data_utils.py + +**DLRM Tests**. Implementation of DLRM tests in ./test + + dlrm_s_test.sh + +**DLRM Benchmarks**. Implementation of DLRM benchmarks in ./bench + + dlrm_s_criteo_kaggle.sh, dlrm_s_criteo_terabyte.sh, dlrm_s_benchmark.sh + +Related Work: + +On the [Glow framework](https://github.com/pytorch/glow) implementation +``` +https://github.com/pytorch/glow/blob/master/tests/unittests/RecommendationSystemTest.cpp +``` +On the [FlexFlow framework](https://github.com/flexflow/FlexFlow) distributed implementation with Legion backend +``` +https://github.com/flexflow/FlexFlow/blob/master/examples/cpp/DLRM/dlrm.cc +``` + +How to run dlrm code? 
+-------------------- +1) A sample run of the code, with a tiny model is shown below +``` +$ python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 +time/loss/accuracy (if enabled): +Finished training it 1/3 of epoch 0, -1.00 ms/it, loss 0.451893, accuracy 0.000% +Finished training it 2/3 of epoch 0, -1.00 ms/it, loss 0.402002, accuracy 0.000% +Finished training it 3/3 of epoch 0, -1.00 ms/it, loss 0.275460, accuracy 0.000% +``` +2) A sample run of the code, with a tiny model in debug mode +``` +$ python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --debug-mode +model arch: +mlp top arch 3 layers, with input to output dimensions: +[8 4 2 1] +# of interactions +8 +mlp bot arch 2 layers, with input to output dimensions: +[4 3 2] +# of features (sparse and dense) +4 +dense feature size +4 +sparse feature size +2 +# of embeddings (= # of sparse features) 3, with dimensions 2x: +[4 3 2] +data (inputs and targets): +mini-batch: 0 +[[0.69647 0.28614 0.22685 0.55131] + [0.71947 0.42311 0.98076 0.68483]] +[[[1], [0, 1]], [[0], [1]], [[1], [0]]] +[[0.55679] + [0.15896]] +mini-batch: 1 +[[0.36179 0.22826 0.29371 0.63098] + [0.0921 0.4337 0.43086 0.49369]] +[[[1], [0, 2, 3]], [[1], [1, 2]], [[1], [1]]] +[[0.15307] + [0.69553]] +mini-batch: 2 +[[0.60306 0.54507 0.34276 0.30412] + [0.41702 0.6813 0.87546 0.51042]] +[[[2], [0, 1, 2]], [[1], [2]], [[1], [1]]] +[[0.31877] + [0.69197]] +initial parameters (weights and bias): +[[ 0.05438 -0.11105] + [ 0.42513 0.34167] + [-0.1426 -0.45641] + [-0.19523 -0.10181]] +[[ 0.23667 0.57199] + [-0.16638 0.30316] + [ 0.10759 0.22136]] +[[-0.49338 -0.14301] + [-0.36649 -0.22139]] +[[0.51313 0.66662 0.10591 0.13089] + [0.32198 0.66156 0.84651 0.55326] + [0.85445 0.38484 0.31679 0.35426]] +[0.17108 0.82911 0.33867] +[[0.55237 0.57855 0.52153] + [0.00269 0.98835 0.90534]] +[0.20764 0.29249] +[[0.52001 0.90191 0.98363 0.25754 0.56436 0.80697 0.39437 0.73107] + [0.16107 0.6007 0.86586 0.98352 0.07937 0.42835 0.20454 0.45064] + [0.54776 0.09333 0.29686 0.92758 0.569 0.45741 0.75353 0.74186] + [0.04858 0.7087 0.83924 0.16594 0.781 0.28654 0.30647 0.66526]] +[0.11139 0.66487 0.88786 0.69631] +[[0.44033 0.43821 0.7651 0.56564] + [0.0849 0.58267 0.81484 0.33707]] +[0.92758 0.75072] +[[0.57406 0.75164]] +[0.07915] +DLRM_Net( + (emb_l): ModuleList( + (0): EmbeddingBag(4, 2, mode=sum) + (1): EmbeddingBag(3, 2, mode=sum) + (2): EmbeddingBag(2, 2, mode=sum) + ) + (bot_l): Sequential( + (0): Linear(in_features=4, out_features=3, bias=True) + (1): ReLU() + (2): Linear(in_features=3, out_features=2, bias=True) + (3): ReLU() + ) + (top_l): Sequential( + (0): Linear(in_features=8, out_features=4, bias=True) + (1): ReLU() + (2): Linear(in_features=4, out_features=2, bias=True) + (3): ReLU() + (4): Linear(in_features=2, out_features=1, bias=True) + (5): Sigmoid() + ) +) +time/loss/accuracy (if enabled): +Finished training it 1/3 of epoch 0, -1.00 ms/it, loss 0.451893, accuracy 0.000% +Finished training it 2/3 of epoch 0, -1.00 ms/it, loss 0.402002, accuracy 0.000% +Finished training it 3/3 of epoch 0, -1.00 ms/it, loss 0.275460, accuracy 0.000% +updated parameters (weights and bias): +[[ 0.0543 -0.1112 ] + [ 0.42513 0.34167] + [-0.14283 -0.45679] + [-0.19532 -0.10197]] +[[ 0.23667 0.57199] + [-0.1666 0.30285] + [ 0.10751 0.22124]] +[[-0.49338 -0.14301] + [-0.36664 -0.22164]] +[[0.51313 0.66663 0.10591 0.1309 ] + [0.32196 0.66154 0.84649 0.55324] + [0.85444 0.38482 0.31677 0.35425]] +[0.17109 0.82907 0.33863] +[[0.55238 0.57857 0.52154] + [0.00265 0.98825 0.90528]] +[0.20764 
0.29244] +[[0.51996 0.90184 0.98368 0.25752 0.56436 0.807 0.39437 0.73107] + [0.16096 0.60055 0.86596 0.98348 0.07938 0.42842 0.20453 0.45064] + [0.5476 0.0931 0.29701 0.92752 0.56902 0.45752 0.75351 0.74187] + [0.04849 0.70857 0.83933 0.1659 0.78101 0.2866 0.30646 0.66526]] +[0.11137 0.66482 0.88778 0.69627] +[[0.44029 0.43816 0.76502 0.56561] + [0.08485 0.5826 0.81474 0.33702]] +[0.92754 0.75067] +[[0.57379 0.7514 ]] +[0.07908] +``` + +Testing +------- +Testing scripts to confirm functional correctness of the code +``` +./test/dlrm_s_test.sh +Running commands ... +python dlrm_s_pytorch.py +python dlrm_s_caffe2.py +Checking results ... +diff test1 (no numeric values in the output = SUCCESS) +diff test2 (no numeric values in the output = SUCCESS) +diff test3 (no numeric values in the output = SUCCESS) +diff test4 (no numeric values in the output = SUCCESS) +``` + +*NOTE: Testing scripts accept extra arguments which will be passed along to the model, such as --use-gpu* + +Benchmarking +------------ +1) Performance benchmarking + ``` + ./bench/dlrm_s_benchmark.sh + ``` + +2) The code supports interface with the [Criteo Kaggle Display Advertising Challenge Dataset](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/). + - Please do the following to prepare the dataset for use with DLRM code: + - First, specify the raw data file (train.txt) as downloaded with --raw-data-file= + - This is then pre-processed (categorize, concat across days...) to allow using with dlrm code + - The processed data is stored as *.npz file in /input/*.npz + - The processed file (*.npz) can be used for subsequent runs with --processed-data-file= + - The model can be trained using the following script + ``` + ./bench/dlrm_s_criteo_kaggle.sh [--test-freq=1024] + ``` + + + +3) The code supports interface with the [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/). + - Please do the following to prepare the dataset for use with DLRM code: + - First, download the raw data files day_0.gz, ...,day_23.gz and unzip them + - Specify the location of the unzipped text files day_0, ...,day_23, using --raw-data-file= (the day number will be appended automatically) + - These are then pre-processed (categorize, concat across days...) to allow using with dlrm code + - The processed data is stored as *.npz file in /input/*.npz + - The processed file (*.npz) can be used for subsequent runs with --processed-data-file= + - The model can be trained using the following script + ``` + ./bench/dlrm_s_criteo_terabyte.sh ["--test-freq=10240 --memory-map --data-sub-sample-rate=0.875"] + ``` + - Corresponding pre-trained model is available under [CC-BY-NC license](https://creativecommons.org/licenses/by-nc/2.0/) and can be downloaded here + [dlrm_emb64_subsample0.875_maxindrange10M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt) + + + +*NOTE: Benchmarking scripts accept extra arguments which will be passed along to the model, such as --num-batches=100 to limit the number of data samples* + +4) The code supports interface with [MLPerf benchmark](https://mlperf.org). 
+ - Please refer to the following training parameters + ``` + --mlperf-logging that keeps track of multiple metrics, including area under the curve (AUC) + + --mlperf-acc-threshold that allows early stopping based on accuracy metric + + --mlperf-auc-threshold that allows early stopping based on AUC metric + + --mlperf-bin-loader that enables preprocessing of data into a single binary file + + --mlperf-bin-shuffle that controls whether a random shuffle of mini-batches is performed + ``` + - The MLPerf training model is completely specified and can be trained using the following script + ``` + ./bench/run_and_time.sh [--use-gpu] + ``` + - Corresponding pre-trained model is available under [CC-BY-NC license](https://creativecommons.org/licenses/by-nc/2.0/) and can be downloaded here + [dlrm_emb128_subsample0.0_maxindrange40M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.pt) + +5) The code now supports synchronous distributed training, we support gloo/nccl/mpi backend, we provide launching mode for [pytorch distributed launcher](https://pytorch.org/docs/stable/distributed.html#launch-utility) and Mpirun. For MPI, users need to write their own MPI launching scripts for configuring the running hosts. For example, using pytorch distributed launcher, we can have the following command as launching scripts: +``` +# for single node 8 gpus and nccl as backend on randomly generated dataset: +python -m torch.distributed.launch --nproc_per_node=8 dlrm_s_pytorch.py --arch-embedding-size="80000-80000-80000-80000-80000-80000-80000-80000" --arch-sparse-feature-size=64 --arch-mlp-bot="128-128-128-128" --arch-mlp-top="512-512-512-256-1" --max-ind-range=40000000 +--data-generation=random --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2 --print-time --test-freq=2 --test-mini-batch-size=2048 --memory-map --use-gpu --num-batches=100 --dist-backend=nccl + +# for multiple nodes, user can add the related argument according to the launcher manual like: +--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" --master_port=1234 +``` + + +Model checkpoint saving/loading +------------------------------- +During training, the model can be saved using --save-model= + +The model is saved if there is an improvement in test accuracy (which is checked at --test-freq intervals). + +A previously saved model can be loaded using --load-model= + +Once loaded the model can be used to continue training, with the saved model being a checkpoint. +Alternatively, the saved model can be used to evaluate only on the test data-set by specifying --inference-only option. + + +Version +------- +0.1 : Initial release of the DLRM code + +1.0 : DLRM with distributed training, cpu support for row-wise adagrad optimizer + +Requirements +------------ +pytorch-nightly (*11/10/20*) + +scikit-learn + +numpy + +onnx (*optional*) + +pydot (*optional*) + +torchviz (*optional*) + +mpi (*optional for distributed backend*) + + +License +------- +This source code is licensed under the MIT license found in the +LICENSE file in the root directory of this source tree. diff --git a/benchmarks/dlrm/ootb/bench/dlrm_s_benchmark.sh b/benchmarks/dlrm/ootb/bench/dlrm_s_benchmark.sh new file mode 100755 index 0000000..c6a75e2 --- /dev/null +++ b/benchmarks/dlrm/ootb/bench/dlrm_s_benchmark.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +#check if extra argument is passed to the test +if [[ $# == 1 ]]; then + dlrm_extra_option=$1 +else + dlrm_extra_option="" +fi +#echo $dlrm_extra_option + +cpu=1 +gpu=1 +pt=1 +c2=1 + +ncores=28 #12 #6 +nsockets="0" + +ngpus="1 2 4 8" + +numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT +dlrm_pt_bin="python dlrm_s_pytorch.py" +dlrm_c2_bin="python dlrm_s_caffe2.py" + +data=random #synthetic +print_freq=100 +rand_seed=727 + +c2_net="async_scheduling" + +#Model param +mb_size=2048 #1024 #512 #256 +nbatches=1000 #500 #100 +bot_mlp="512-512-64" +top_mlp="1024-1024-1024-1" +emb_size=64 +nindices=100 +emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000" +interaction="dot" +tnworkers=0 +tmb_size=16384 + +#_args="--mini-batch-size="${mb_size}\ +_args=" --num-batches="${nbatches}\ +" --data-generation="${data}\ +" --arch-mlp-bot="${bot_mlp}\ +" --arch-mlp-top="${top_mlp}\ +" --arch-sparse-feature-size="${emb_size}\ +" --arch-embedding-size="${emb}\ +" --num-indices-per-lookup="${nindices}\ +" --arch-interaction-op="${interaction}\ +" --numpy-rand-seed="${rand_seed}\ +" --print-freq="${print_freq}\ +" --print-time"\ +" --enable-profiling " + +c2_args=" --caffe2-net-type="${c2_net} + + +# CPU Benchmarking +if [ $cpu = 1 ]; then + echo "--------------------------------------------" + echo "CPU Benchmarking - running on $ncores cores" + echo "--------------------------------------------" + if [ $pt = 1 ]; then + outf="model1_CPU_PT_$ncores.log" + outp="dlrm_s_pytorch.prof" + echo "-------------------------------" + echo "Running PT (log file: $outf)" + echo "-------------------------------" + cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf" + echo $cmd + eval $cmd + min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') + echo "Min time per iteration = $min" + # move profiling file(s) + mv $outp ${outf//".log"/".prof"} + mv ${outp//".prof"/".json"} ${outf//".log"/".json"} + + fi + if [ $c2 = 1 ]; then + outf="model1_CPU_C2_$ncores.log" + outp="dlrm_s_caffe2.prof" + echo "-------------------------------" + echo "Running C2 (log file: $outf)" + echo "-------------------------------" + cmd="$numa_cmd $dlrm_c2_bin --mini-batch-size=$mb_size $_args $c2_args $dlrm_extra_option 1> $outf 2> $outp" + echo $cmd + eval $cmd + min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') + echo "Min time per iteration = $min" + # move profiling file (collected from stderr above) + mv $outp ${outf//".log"/".prof"} + fi +fi + +# GPU Benchmarking +if [ $gpu = 1 ]; then + echo "--------------------------------------------" + echo "GPU Benchmarking - running on $ngpus GPUs" + echo "--------------------------------------------" + for _ng in $ngpus + do + # weak scaling + # _mb_size=$((mb_size*_ng)) + # strong scaling + _mb_size=$((mb_size*1)) + _gpus=$(seq -s, 0 $((_ng-1))) + cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus" + echo "-------------------" + echo "Using GPUS: "$_gpus + echo "-------------------" + if [ $pt = 1 ]; then + outf="model1_GPU_PT_$_ng.log" + outp="dlrm_s_pytorch.prof" + echo "-------------------------------" + echo "Running PT (log file: $outf)" + echo "-------------------------------" + cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf" + echo $cmd + eval 
$cmd + min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') + echo "Min time per iteration = $min" + # move profiling file(s) + mv $outp ${outf//".log"/".prof"} + mv ${outp//".prof"/".json"} ${outf//".log"/".json"} + fi + if [ $c2 = 1 ]; then + outf="model1_GPU_C2_$_ng.log" + outp="dlrm_s_caffe2.prof" + echo "-------------------------------" + echo "Running C2 (log file: $outf)" + echo "-------------------------------" + cmd="$cuda_arg $dlrm_c2_bin --mini-batch-size=$_mb_size $_args $c2_args --use-gpu $dlrm_extra_option 1> $outf 2> $outp" + echo $cmd + eval $cmd + min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') + echo "Min time per iteration = $min" + # move profiling file (collected from stderr above) + mv $outp ${outf//".log"/".prof"} + fi + done +fi diff --git a/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_kaggle.sh b/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_kaggle.sh new file mode 100755 index 0000000..867d8c0 --- /dev/null +++ b/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_kaggle.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +#WARNING: must have compiled PyTorch and caffe2 + +#check if extra argument is passed to the test +if [[ $# == 1 ]]; then + dlrm_extra_option=$1 +else + dlrm_extra_option="" +fi +#echo $dlrm_extra_option + +dlrm_pt_bin="python dlrm_s_pytorch.py" +dlrm_c2_bin="python dlrm_s_caffe2.py" + +echo "run pytorch ..." +# WARNING: the following parameters will be set based on the data set +# --arch-embedding-size=... (sparse feature sizes) +# --arch-mlp-bot=... (the input to the first layer of bottom mlp) +$dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_kaggle_pt.log + +echo "run caffe2 ..." +# WARNING: the following parameters will be set based on the data set +# --arch-embedding-size=... (sparse feature sizes) +# --arch-mlp-bot=... (the input to the first layer of bottom mlp) +$dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log + +echo "done" diff --git a/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_terabyte.sh b/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_terabyte.sh new file mode 100755 index 0000000..5a4ee94 --- /dev/null +++ b/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_terabyte.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
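+#
+# Usage sketch: the single optional argument below is forwarded verbatim to both
+# binaries, so any dlrm_s_pytorch.py / dlrm_s_caffe2.py flag can be appended,
+# e.g. (hypothetical invocation):
+#   ./dlrm_s_criteo_terabyte.sh "--use-gpu"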
+# +#WARNING: must have compiled PyTorch and caffe2 + +#check if extra argument is passed to the test +if [[ $# == 1 ]]; then + dlrm_extra_option=$1 +else + dlrm_extra_option="" +fi +#echo $dlrm_extra_option + +dlrm_pt_bin="python dlrm_s_pytorch.py" +dlrm_c2_bin="python dlrm_s_caffe2.py" + +echo "run pytorch ..." +# WARNING: the following parameters will be set based on the data set +# --arch-embedding-size=... (sparse feature sizes) +# --arch-mlp-bot=... (the input to the first layer of bottom mlp) +$dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log + +echo "run caffe2 ..." +# WARNING: the following parameters will be set based on the data set +# --arch-embedding-size=... (sparse feature sizes) +# --arch-mlp-bot=... (the input to the first layer of bottom mlp) +$dlrm_c2_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_terabyte_c2.log + +echo "done" diff --git a/benchmarks/dlrm/ootb/bench/run_and_time.sh b/benchmarks/dlrm/ootb/bench/run_and_time.sh new file mode 100755 index 0000000..e241d80 --- /dev/null +++ b/benchmarks/dlrm/ootb/bench/run_and_time.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +#WARNING: must have compiled PyTorch and caffe2 + +#check if extra argument is passed to the test +if [[ $# == 1 ]]; then + dlrm_extra_option=$1 +else + dlrm_extra_option="" +fi +#echo $dlrm_extra_option + +python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log + +echo "done" diff --git a/benchmarks/dlrm/ootb/cython/cython_compile.py b/benchmarks/dlrm/ootb/cython/cython_compile.py new file mode 100644 index 0000000..ffacf08 --- /dev/null +++ b/benchmarks/dlrm/ootb/cython/cython_compile.py @@ -0,0 +1,26 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
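+#
+# Build sketch (assuming data_utils.py has already been copied to
+# data_utils_cython.pyx, as the step-by-step notes in cython_criteo.py describe):
+#   python cython_compile.py build_ext --inplace
+# which should leave a data_utils_cython*.so next to this script.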
+# +# Description: compile .so from python code + +from __future__ import absolute_import, division, print_function, unicode_literals + +from setuptools import setup +from Cython.Build import cythonize +from distutils.extension import Extension + +ext_modules = [ + Extension( + "data_utils_cython", + ["data_utils_cython.pyx"], + extra_compile_args=['-O3'], + extra_link_args=['-O3'], + ) +] + +setup( + name='data_utils_cython', + ext_modules=cythonize(ext_modules) +) diff --git a/benchmarks/dlrm/ootb/cython/cython_criteo.py b/benchmarks/dlrm/ootb/cython/cython_criteo.py new file mode 100644 index 0000000..46a0b7d --- /dev/null +++ b/benchmarks/dlrm/ootb/cython/cython_criteo.py @@ -0,0 +1,55 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: run dataset pre-processing in standalone mode +# WARNING: These steps are required to work with Cython +# 1. Instal Cython +# > sudo yum install Cython +# 2. Please copy data_utils.py into data_utils_cython.pyx +# 3. Compile the data_utils_cython.pyx to generate .so +# (it's important to keep extension .pyx rather than .py +# to ensure the C/C++ .so no .py is loaded at import time) +# > python cython_compile.py build_ext --inplace +# This should create data_utils_cython.so, which can be loaded below with "import" +# 4. Run standalone datatset preprocessing to generate .npz files +# a. Kaggle +# > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt +# --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz +# b. Terabyte +# > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte +# --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz + +from __future__ import absolute_import, division, print_function, unicode_literals + +import data_utils_cython as duc + +if __name__ == "__main__": + ### import packages ### + import argparse + + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Preprocess Criteo dataset" + ) + # model related parameters + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--memory-map", action="store_true", default=False) + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + args = parser.parse_args() + + duc.loadDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map + ) diff --git a/benchmarks/dlrm/ootb/data_loader_terabyte.py b/benchmarks/dlrm/ootb/data_loader_terabyte.py new file mode 100644 index 0000000..cf0db71 --- /dev/null +++ b/benchmarks/dlrm/ootb/data_loader_terabyte.py @@ -0,0 +1,368 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
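+#
+# Minimal usage sketch for the iterator-style DataLoader defined below, assuming
+# pre-processed "day_<i>_reordered.npz" files and "day_day_count.npz" under ./input:
+#
+#   train_data = DataLoader(data_filename="day", data_directory="./input",
+#                           days=list(range(23)), batch_size=2048, split="train")
+#   for X_int, lS_o, X_cat, y in train_data:
+#       ...  # one mini-batch, already converted to torch tensors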
+ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import numpy as np +from torch.utils.data import Dataset +import torch +import time +import math +from tqdm import tqdm +import argparse + + +class DataLoader: + """ + DataLoader dedicated for the Criteo Terabyte Click Logs dataset + """ + + def __init__( + self, + data_filename, + data_directory, + days, + batch_size, + max_ind_range=-1, + split="train", + drop_last_batch=False + ): + self.data_filename = data_filename + self.data_directory = data_directory + self.days = days + self.batch_size = batch_size + self.max_ind_range = max_ind_range + + total_file = os.path.join( + data_directory, + data_filename + "_day_count.npz" + ) + with np.load(total_file) as data: + total_per_file = data["total_per_file"][np.array(days)] + + self.length = sum(total_per_file) + if split == "test" or split == "val": + self.length = int(np.ceil(self.length / 2.)) + self.split = split + self.drop_last_batch = drop_last_batch + + def __iter__(self): + return iter( + _batch_generator( + self.data_filename, self.data_directory, self.days, + self.batch_size, self.split, self.drop_last_batch, self.max_ind_range + ) + ) + + def __len__(self): + if self.drop_last_batch: + return self.length // self.batch_size + else: + return math.ceil(self.length / self.batch_size) + + +def _transform_features( + x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False +): + if max_ind_range > 0: + x_cat_batch = x_cat_batch % max_ind_range + + if flag_input_torch_tensor: + x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1) + x_cat_batch = x_cat_batch.clone().detach().type(torch.long) + y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1) + else: + x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1) + x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long) + y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1) + + batch_size = x_cat_batch.shape[0] + feature_count = x_cat_batch.shape[1] + lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1) + + return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1) + + +def _batch_generator( + data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range +): + previous_file = None + for day in days: + filepath = os.path.join( + data_directory, + data_filename + "_{}_reordered.npz".format(day) + ) + + # print('Loading file: ', filepath) + with np.load(filepath) as data: + x_int = data["X_int"] + x_cat = data["X_cat"] + y = data["y"] + + samples_in_file = y.shape[0] + batch_start_idx = 0 + if split == "test" or split == "val": + length = int(np.ceil(samples_in_file / 2.)) + if split == "test": + samples_in_file = length + elif split == "val": + batch_start_idx = samples_in_file - length + + while batch_start_idx < samples_in_file - batch_size: + + missing_samples = batch_size + if previous_file is not None: + missing_samples -= previous_file['y'].shape[0] + + current_slice = slice(batch_start_idx, batch_start_idx + missing_samples) + + x_int_batch = x_int[current_slice] + x_cat_batch = x_cat[current_slice] + y_batch = y[current_slice] + + if previous_file is not None: + x_int_batch = np.concatenate( + [previous_file['x_int'], x_int_batch], + axis=0 + ) + x_cat_batch = np.concatenate( + [previous_file['x_cat'], x_cat_batch], + axis=0 + ) + y_batch = np.concatenate([previous_file['y'], y_batch], axis=0) + previous_file = None + + if x_int_batch.shape[0] != 
batch_size: + raise ValueError('should not happen') + + yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range) + + batch_start_idx += missing_samples + if batch_start_idx != samples_in_file: + current_slice = slice(batch_start_idx, samples_in_file) + if previous_file is not None: + previous_file = { + 'x_int' : np.concatenate( + [previous_file['x_int'], x_int[current_slice]], + axis=0 + ), + 'x_cat' : np.concatenate( + [previous_file['x_cat'], x_cat[current_slice]], + axis=0 + ), + 'y' : np.concatenate([previous_file['y'], y[current_slice]], axis=0) + } + else: + previous_file = { + 'x_int' : x_int[current_slice], + 'x_cat' : x_cat[current_slice], + 'y' : y[current_slice] + } + + if not drop_last: + yield _transform_features( + previous_file['x_int'], + previous_file['x_cat'], + previous_file['y'], + max_ind_range + ) + + +def _test(): + generator = _batch_generator( + data_filename='day', + data_directory='./input', + days=range(23), + split="train", + batch_size=2048, + drop_last=True, + max_ind_range=-1 + ) + t1 = time.time() + for x_int, lS_o, x_cat, y in generator: + t2 = time.time() + time_diff = t2 - t1 + t1 = t2 + print( + "time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format( + time_diff, x_int.shape, lS_o.shape, x_cat.shape, y.shape + ) + ) + + +class CriteoBinDataset(Dataset): + """Binary version of criteo dataset.""" + + def __init__(self, data_file, counts_file, + batch_size=1, max_ind_range=-1, bytes_per_feature=4): + # dataset + self.tar_fea = 1 # single target + self.den_fea = 13 # 13 dense features + self.spa_fea = 26 # 26 sparse features + self.tad_fea = self.tar_fea + self.den_fea + self.tot_fea = self.tad_fea + self.spa_fea + + self.batch_size = batch_size + self.max_ind_range = max_ind_range + self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size) + + self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry) + + print('data file:', data_file, 'number of batches:', self.num_entries) + self.file = open(data_file, 'rb') + + with np.load(counts_file) as data: + self.counts = data["counts"] + + # hardcoded for now + self.m_den = 13 + + def __len__(self): + return self.num_entries + + def __getitem__(self, idx): + self.file.seek(idx * self.bytes_per_entry, 0) + raw_data = self.file.read(self.bytes_per_entry) + array = np.frombuffer(raw_data, dtype=np.int32) + tensor = torch.from_numpy(array).view((-1, self.tot_fea)) + + return _transform_features(x_int_batch=tensor[:, 1:14], + x_cat_batch=tensor[:, 14:], + y_batch=tensor[:, 0], + max_ind_range=self.max_ind_range, + flag_input_torch_tensor=True) + + def __del__(self): + self.file.close() + + +def numpy_to_binary(input_files, output_file_path, split='train'): + """Convert the data to a binary format to be read with CriteoBinDataset.""" + + # WARNING - both categorical and numerical data must fit into int32 for + # the following code to work correctly + + with open(output_file_path, 'wb') as output_file: + if split == 'train': + for input_file in input_files: + print('Processing file: ', input_file) + + np_data = np.load(input_file) + np_data = np.concatenate([np_data['y'].reshape(-1, 1), + np_data['X_int'], + np_data['X_cat']], axis=1) + np_data = np_data.astype(np.int32) + + output_file.write(np_data.tobytes()) + else: + assert len(input_files) == 1 + np_data = np.load(input_files[0]) + np_data = np.concatenate([np_data['y'].reshape(-1, 1), + np_data['X_int'], + np_data['X_cat']], axis=1) + np_data = np_data.astype(np.int32) + + samples_in_file = 
np_data.shape[0] + midpoint = int(np.ceil(samples_in_file / 2.)) + if split == "test": + begin = 0 + end = midpoint + elif split == "val": + begin = midpoint + end = samples_in_file + else: + raise ValueError('Unknown split value: ', split) + + output_file.write(np_data[begin:end].tobytes()) + + +def _preprocess(args): + train_files = ['{}_{}_reordered.npz'.format(args.input_data_prefix, day) for + day in range(0, 23)] + + test_valid_file = args.input_data_prefix + '_23_reordered.npz' + + os.makedirs(args.output_directory, exist_ok=True) + for split in ['train', 'val', 'test']: + print('Running preprocessing for split =', split) + + output_file = os.path.join(args.output_directory, + '{}_data.bin'.format(split)) + + input_files = train_files if split == 'train' else [test_valid_file] + numpy_to_binary(input_files=input_files, + output_file_path=output_file, + split=split) + + +def _test_bin(): + parser = argparse.ArgumentParser() + parser.add_argument('--output_directory', required=True) + parser.add_argument('--input_data_prefix', required=True) + parser.add_argument('--split', choices=['train', 'test', 'val'], + required=True) + args = parser.parse_args() + + _preprocess(args) + + binary_data_file = os.path.join(args.output_directory, + '{}_data.bin'.format(args.split)) + + counts_file = os.path.join(args.output_directory, 'day_fea_count.npz') + dataset_binary = CriteoBinDataset(data_file=binary_data_file, + counts_file=counts_file, + batch_size=2048,) + from dlrm_data_pytorch import CriteoDataset + from dlrm_data_pytorch import collate_wrapper_criteo_offset as collate_wrapper_criteo + + binary_loader = torch.utils.data.DataLoader( + dataset_binary, + batch_size=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + ) + + original_dataset = CriteoDataset( + dataset='terabyte', + max_ind_range=10 * 1000 * 1000, + sub_sample_rate=1, + randomize=True, + split=args.split, + raw_path=args.input_data_prefix, + pro_data='dummy_string', + memory_map=True + ) + + original_loader = torch.utils.data.DataLoader( + original_dataset, + batch_size=2048, + shuffle=False, + num_workers=0, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, + ) + + assert len(dataset_binary) == len(original_loader) + for i, (old_batch, new_batch) in tqdm(enumerate(zip(original_loader, + binary_loader)), + total=len(dataset_binary)): + + for j in range(len(new_batch)): + if not np.array_equal(old_batch[j], new_batch[j]): + raise ValueError('FAILED: Datasets not equal') + if i > len(dataset_binary): + break + print('PASSED') + + +if __name__ == '__main__': + _test() + _test_bin() diff --git a/benchmarks/dlrm/ootb/data_utils.py b/benchmarks/dlrm/ootb/data_utils.py new file mode 100644 index 0000000..bf76dff --- /dev/null +++ b/benchmarks/dlrm/ootb/data_utils.py @@ -0,0 +1,1292 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
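+#
+# Stand-alone command-line sketch (arguments are parsed in __main__ at the bottom
+# of this file), e.g. for the Kaggle dataset:
+#   python data_utils.py --data-set=kaggle --raw-data-file=./input/train.txt \
+#       --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz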
+# +# Description: generate inputs and targets for the DLRM benchmark +# +# Utility function(s) to download and pre-process public data sets +# - Criteo Kaggle Display Advertising Challenge Dataset +# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset +# - Criteo Terabyte Dataset +# https://labs.criteo.com/2013/12/download-terabyte-click-logs +# +# After downloading dataset, run: +# getCriteoAdData( +# datafile="", +# o_filename=kaggleAdDisplayChallenge_processed.npz, +# max_ind_range=-1, +# sub_sample_rate=0.0, +# days=7, +# data_split='train', +# randomize='total', +# criteo_kaggle=True, +# memory_map=False +# ) +# getCriteoAdData( +# datafile="", +# o_filename=terabyte_processed.npz, +# max_ind_range=-1, +# sub_sample_rate=0.0, +# days=24, +# data_split='train', +# randomize='total', +# criteo_kaggle=False, +# memory_map=False +# ) + +from __future__ import absolute_import, division, print_function, unicode_literals + +import sys +# import os +from os import path +from multiprocessing import Process, Manager +# import io +# from io import StringIO +# import collections as coll + +import numpy as np + + +def convertUStringToDistinctIntsDict(mat, convertDicts, counts): + # Converts matrix of unicode strings into distinct integers. + # + # Inputs: + # mat (np.array): array of unicode strings to convert + # convertDicts (list): dictionary for each column + # counts (list): number of different categories in each column + # + # Outputs: + # out (np.array): array of output integers + # convertDicts (list): dictionary for each column + # counts (list): number of different categories in each column + + # check if convertDicts and counts match correct length of mat + if len(convertDicts) != mat.shape[1] or len(counts) != mat.shape[1]: + print("Length of convertDicts or counts does not match input shape") + print("Generating convertDicts and counts...") + + convertDicts = [{} for _ in range(mat.shape[1])] + counts = [0 for _ in range(mat.shape[1])] + + # initialize output + out = np.zeros(mat.shape) + + for j in range(mat.shape[1]): + for i in range(mat.shape[0]): + # add to convertDict and increment count + if mat[i, j] not in convertDicts[j]: + convertDicts[j][mat[i, j]] = counts[j] + counts[j] += 1 + out[i, j] = convertDicts[j][mat[i, j]] + + return out, convertDicts, counts + + +def convertUStringToDistinctIntsUnique(mat, mat_uni, counts): + # mat is an array of 0,...,# samples, with each being 26 categorical features + + # check if mat_unique and counts match correct length of mat + if len(mat_uni) != mat.shape[1] or len(counts) != mat.shape[1]: + print("Length of mat_unique or counts does not match input shape") + print("Generating mat_unique and counts...") + + mat_uni = [np.array([]) for _ in range(mat.shape[1])] + counts = [0 for _ in range(mat.shape[1])] + + # initialize output + out = np.zeros(mat.shape) + ind_map = [np.array([]) for _ in range(mat.shape[1])] + + # find out and assign unique ids to features + for j in range(mat.shape[1]): + m = mat_uni[j].size + mat_concat = np.concatenate((mat_uni[j], mat[:, j])) + mat_uni[j], ind_map[j] = np.unique(mat_concat, return_inverse=True) + out[:, j] = ind_map[j][m:] + counts[j] = mat_uni[j].size + + return out, mat_uni, counts + + +def processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, pre_comp_counts): + # Process Kaggle Display Advertising Challenge or Terabyte Dataset + # by converting unicode strings in X_cat to integers and + # converting negative integer values in X_int. 
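+    #
+    # (Illustrative only: convertDicts[j] maps each raw hashed categorical id to a
+    # dense index, e.g. a hypothetical {0x5316a17f: 0, 0x0b153874: 1, ...}, so the
+    # loop below rewrites X_cat_t[j, k] as a small contiguous integer.)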
+ # + # Loads data in the form "{kaggle|terabyte}_day_i.npz" where i is the day. + # + # Inputs: + # d_path (str): path for {kaggle|terabyte}_day_i.npz files + # i (int): splits in the dataset (typically 0 to 7 or 0 to 24) + + # process data if not all files exist + filename_i = npzfile + "_{0}_processed.npz".format(i) + + if path.exists(filename_i): + print("Using existing " + filename_i, end="\n") + else: + print("Not existing " + filename_i) + with np.load(npzfile + "_{0}.npz".format(i)) as data: + # categorical features + ''' + # Approach 1a: using empty dictionaries + X_cat, convertDicts, counts = convertUStringToDistinctIntsDict( + data["X_cat"], convertDicts, counts + ) + ''' + ''' + # Approach 1b: using empty np.unique + X_cat, convertDicts, counts = convertUStringToDistinctIntsUnique( + data["X_cat"], convertDicts, counts + ) + ''' + # Approach 2a: using pre-computed dictionaries + X_cat_t = np.zeros(data["X_cat_t"].shape) + for j in range(26): + for k, x in enumerate(data["X_cat_t"][j, :]): + X_cat_t[j, k] = convertDicts[j][x] + # continuous features + X_int = data["X_int"] + X_int[X_int < 0] = 0 + # targets + y = data["y"] + + np.savez_compressed( + filename_i, + # X_cat = X_cat, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=X_int, + y=y, + ) + print("Processed " + filename_i, end="\n") + # sanity check (applicable only if counts have been pre-computed & are re-computed) + # for j in range(26): + # if pre_comp_counts[j] != counts[j]: + # sys.exit("ERROR: Sanity check on counts has failed") + # print("\nSanity check on counts passed") + + return + + +def concatCriteoAdData( + d_path, + d_file, + npzfile, + trafile, + days, + data_split, + randomize, + total_per_file, + total_count, + memory_map, + o_filename +): + # Concatenates different days and saves the result. 
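+    # With memory_map=True the per-day files are reshuffled on disk (two-pass FYR
+    # shuffle below) into "<npzfile>_<i>_reordered.npz" files; with memory_map=False
+    # all processed days are concatenated into a single "<o_filename>.npz" holding
+    # X_cat, X_int, y and counts.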
+ # + # Inputs: + # days (int): total number of days in the dataset (typically 7 or 24) + # d_path (str): path for {kaggle|terabyte}_day_i.npz files + # o_filename (str): output file name + # + # Output: + # o_file (str): output file path + + if memory_map: + # dataset break up per fea + # tar_fea = 1 # single target + den_fea = 13 # 13 dense features + spa_fea = 26 # 26 sparse features + # tad_fea = tar_fea + den_fea + # tot_fea = tad_fea + spa_fea + # create offset per file + offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + offset_per_file[i + 1] += offset_per_file[i] + + ''' + # Approach 1, 2 and 3 use indices, while Approach 4 does not use them + # create indices + indices = np.arange(total_count) + if data_split == "none": + if randomize == "total": + indices = np.random.permutation(indices) + else: + indices = np.array_split(indices, offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + indices = np.concatenate((train_indices, test_indices)) + # no reordering + # indices = np.arange(total_count) + ''' + ''' + # Approach 1: simple and slow (no grouping is used) + # check if data already exists + recreate_flag = False + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((total_count)) + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered".format(j) + np.save(filename_j, z) + print("Creating " + filename_j) + + for i in range(days): + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat_t = np.transpose(data["X_cat"]) + X_int_t = np.transpose(data["X_int"]) + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + # print(filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r+') + if j < tar_fea: + fj[indices[start:end]] = y + elif tar_fea <= j and j < tad_fea: + fj[indices[start:end]] = X_int_t[j - tar_fea, :] + else: + fj[indices[start:end]] = X_cat_t[j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing " + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for i in range(days): + filename_i = d_path + 
npzfile + "_{0}_reordered.npz".format(i) + size = total_per_file[i] + X_int_t = np.zeros((den_fea, size)) + X_cat_t = np.zeros((spa_fea, size)) + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + print("Creating " + filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r') + if j < tar_fea: + y = fj[start:end] + elif tar_fea <= j and j < tad_fea: + X_int_t[j - tar_fea, :] = fj[start:end] + else: + X_cat_t[j - tad_fea, :] = fj[start:end] + del fj + + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=np.transpose(X_int_t), # transpose of the data + y=y, + ) + else: + print("Reordered day files already exist, skipping ...") + ''' + ''' + # Approach 2: group days + # check if data already exists + recreate_flag = False + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((total_count)) + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered".format(j) + np.save(filename_j, z) + print("Creating " + filename_j) + + group_day = 3 # e.g. 8, 4 or 3 + group_num = days // group_day + file_group = [i*group_day for i in range(group_num)] + [days] + for ii in range(group_num): + # for last may be group_size != group_num, therefore reset it below + group_size = file_group[ii + 1] - file_group[ii] + X_cat_t = [0]*group_size + X_int_t = [0]*group_size + y = [0]*group_size + start = [0]*group_size + end = [0]*group_size + for ig in range(group_size): + i = file_group[ii] + ig + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + # setup start and end ranges + start[ig] = offset_per_file[i] + end[ig] = offset_per_file[i + 1] + # print(filename_i) + # load a group of files + with np.load(filename_i) as data: + X_cat_t[ig] = np.transpose(data["X_cat"]) + X_int_t[ig] = np.transpose(data["X_int"]) + y[ig] = data["y"] + # sanity check + if total_per_file[i] != len(y[ig]): + sys.exit("ERROR: sanity check on number of samples failed") + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end[ig]-start[ig]) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r+') + for ig in range(group_size): + if j < tar_fea: + fj[indices[start[ig]:end[ig]]] = y[ig] + elif tar_fea <= j and j < tad_fea: + fj[indices[start[ig]:end[ig]]] = X_int_t[ig][j - tar_fea, :] + else: + fj[indices[start[ig]:end[ig]]] = X_cat_t[ig][j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing " + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for ii in range(group_num): + # for last may be group_size != group_num, therefore reset it below + 
group_size = file_group[ii + 1] - file_group[ii] + X_cat_t= []; X_int_t = [] + for ig in range(group_size): + i = file_group[ii] + ig + X_int_t.append(np.zeros((den_fea, total_per_file[i]))) + X_cat_t.append(np.zeros((spa_fea, total_per_file[i]))) + y = [0]*group_size + start = [0]*group_size + end = [0]*group_size + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r') + # load a group of files + for ig in range(group_size): + i = file_group[ii] + ig + # setup start and end ranges + start[ig] = offset_per_file[i] + end[ig] = offset_per_file[i + 1] + # load data for the group of files + if j < tar_fea: + y[ig] = fj[start[ig]:end[ig]] + elif tar_fea <= j and j < tad_fea: + X_int_t[ig][j - tar_fea, :] = fj[start[ig]:end[ig]] + else: + X_cat_t[ig][j - tad_fea, :] = fj[start[ig]:end[ig]] + del fj + + for ig in range(group_size): + i = file_group[ii] + ig + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + print("Creating " + filename_i) + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t[ig]), # transpose of the data + X_int=np.transpose(X_int_t[ig]), # transpose of the data + y=y[ig], + ) + else: + print("Reordered day files already exist, skipping ...") + ''' + ''' + # Approach 3: group features + # check if data already exists + group_fea = 5 # e.g. 8, 5 or 4 + group_num = tot_fea // group_fea + if tot_fea % group_fea != 0: # sanity check + sys.exit("ERROR: the group_fea must divided tot_fea evenly.") + recreate_flag = False + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((group_fea, total_count)) + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}".format( + jn, group_fea + ) + np.save(filename_j, z) + print("Creating " + filename_j) + + for i in range(days): + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat_t = np.transpose(data["X_cat"]) + X_int_t = np.transpose(data["X_int"]) + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + # print(filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + fj = np.load(filename_j, mmap_mode='r+') + for jg in range(group_fea): + j = jn * group_fea + jg + # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg)) + if j < tar_fea: + fj[jg, indices[start:end]] = y + elif tar_fea <= j and j < tad_fea: + fj[jg, indices[start:end]] = X_int_t[j - tar_fea, :] + else: + fj[jg, indices[start:end]] = X_cat_t[j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing" + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered 
files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + size = total_per_file[i] + X_int_t = np.zeros((den_fea, size)) + X_cat_t = np.zeros((spa_fea, size)) + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + print("Creating " + filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + fj = np.load(filename_j, mmap_mode='r') + for jg in range(group_fea): + j = jn * group_fea + jg + # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg)) + if j < tar_fea: + y = fj[jg, start:end] + elif tar_fea <= j and j < tad_fea: + X_int_t[j - tar_fea, :] = fj[jg, start:end] + else: + X_cat_t[j - tad_fea, :] = fj[jg, start:end] + del fj + + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=np.transpose(X_int_t), # transpose of the data + y=y, + ) + + else: + print("Reordered day files already exist, skipping ...") + ''' + + # Approach 4: Fisher-Yates-Rao (FYR) shuffle algorithm + # 1st pass of FYR shuffle + # check if data already exists + recreate_flag = False + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + if ( + path.exists(filename_j_y) + and path.exists(filename_j_d) + and path.exists(filename_j_s) + ): + print( + "Using existing\n" + + filename_j_y + "\n" + + filename_j_d + "\n" + + filename_j_s + ) + else: + recreate_flag = True + # reorder across buckets using sampling + if recreate_flag: + # init intermediate files (.npy appended automatically) + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s".format(j) + np.save(filename_j_y, np.zeros((total_per_file[j]))) + np.save(filename_j_d, np.zeros((total_per_file[j], den_fea))) + np.save(filename_j_s, np.zeros((total_per_file[j], spa_fea))) + # start processing files + total_counter = [0] * days + for i in range(days): + filename_i = npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat = data["X_cat"] + X_int = data["X_int"] + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # debug prints + print("Reordering (1st pass) " + filename_i) + + # create buckets using sampling of random ints + # from (discrete) uniform distribution + buckets = [] + for _j in range(days): + buckets.append([]) + counter = [0] * days + days_to_sample = days if data_split == "none" else days - 1 + if randomize == "total": + rand_u = np.random.randint(low=0, high=days_to_sample, size=size) + for k in range(size): + # sample and make sure elements per buckets do not overflow + if data_split == "none" or i < days - 1: + # choose bucket + p = rand_u[k] + # retry of the bucket is full + while total_counter[p] + counter[p] >= total_per_file[p]: + p = np.random.randint(low=0, high=days_to_sample) + else: # preserve the last day/bucket if needed + p = i + buckets[p].append(k) + counter[p] += 1 + else: # randomize is day or none + for k in range(size): + # do 
not sample, preserve the data in this bucket + p = i + buckets[p].append(k) + counter[p] += 1 + + # sanity check + if np.sum(counter) != size: + sys.exit("ERROR: sanity check on number of samples failed") + # debug prints + # print(counter) + # print(str(np.sum(counter)) + " = " + str(size)) + # print([len(x) for x in buckets]) + # print(total_counter) + + # partially feel the buckets + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + start = total_counter[j] + end = total_counter[j] + counter[j] + # target buckets + fj_y = np.load(filename_j_y, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - start=" + str(end - start) + " " + # + str(fj_y[start:end].shape) + " " + # + str(len(buckets[j]))) + fj_y[start:end] = y[buckets[j]] + del fj_y + # dense buckets + fj_d = np.load(filename_j_d, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - start=" + str(end - start) + " " + # + str(fj_d[start:end, :].shape) + " " + # + str(len(buckets[j]))) + fj_d[start:end, :] = X_int[buckets[j], :] + del fj_d + # sparse buckets + fj_s = np.load(filename_j_s, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - start=" + str(end - start) + " " + # + str(fj_s[start:end, :].shape) + " " + # + str(len(buckets[j]))) + fj_s[start:end, :] = X_cat[buckets[j], :] + del fj_s + # update counters for next step + total_counter[j] += counter[j] + + # 2nd pass of FYR shuffle + # check if data already exists + for j in range(days): + filename_j = npzfile + "_{0}_reordered.npz".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # reorder within buckets + if recreate_flag: + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + fj_y = np.load(filename_j_y) + fj_d = np.load(filename_j_d) + fj_s = np.load(filename_j_s) + + indices = range(total_per_file[j]) + if randomize == "day" or randomize == "total": + if data_split == "none" or j < days - 1: + indices = np.random.permutation(range(total_per_file[j])) + + filename_r = npzfile + "_{0}_reordered.npz".format(j) + print("Reordering (2nd pass) " + filename_r) + np.savez_compressed( + filename_r, + X_cat=fj_s[indices, :], + X_int=fj_d[indices, :], + y=fj_y[indices], + ) + + ''' + # sanity check (under no reordering norms should be zero) + for i in range(days): + filename_i_o = npzfile + "_{0}_processed.npz".format(i) + print(filename_i_o) + with np.load(filename_i_o) as data_original: + X_cat_o = data_original["X_cat"] + X_int_o = data_original["X_int"] + y_o = data_original["y"] + filename_i_r = npzfile + "_{0}_reordered.npz".format(i) + print(filename_i_r) + with np.load(filename_i_r) as data_reordered: + X_cat_r = data_reordered["X_cat"] + X_int_r = data_reordered["X_int"] + y_r = data_reordered["y"] + print(np.linalg.norm(y_o - y_r)) + print(np.linalg.norm(X_int_o - X_int_r)) + print(np.linalg.norm(X_cat_o - X_cat_r)) + ''' + + else: + print("Concatenating multiple days into %s.npz file" % str(d_path + o_filename)) + + # load and concatenate data + for i in range(days): + filename_i = npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + if i == 0: + X_cat = data["X_cat"] + X_int 
= data["X_int"] + y = data["y"] + else: + X_cat = np.concatenate((X_cat, data["X_cat"])) + X_int = np.concatenate((X_int, data["X_int"])) + y = np.concatenate((y, data["y"])) + print("Loaded day:", i, "y = 1:", len(y[y == 1]), "y = 0:", len(y[y == 0])) + + with np.load(d_path + d_file + "_fea_count.npz") as data: + counts = data["counts"] + print("Loaded counts!") + + np.savez_compressed( + d_path + o_filename + ".npz", + X_cat=X_cat, + X_int=X_int, + y=y, + counts=counts, + ) + + return d_path + o_filename + ".npz" + + +def transformCriteoAdData(X_cat, X_int, y, days, data_split, randomize, total_per_file): + # Transforms Criteo Kaggle or terabyte data by applying log transformation + # on dense features and converting everything to appropriate tensors. + # + # Inputs: + # X_cat (ndarray): array of integers corresponding to preprocessed + # categorical features + # X_int (ndarray): array of integers corresponding to dense features + # y (ndarray): array of bool corresponding to labels + # data_split(str): flag for splitting dataset into training/validation/test + # sets + # randomize (str): determines randomization scheme + # "none": no randomization + # "day": randomizes each day"s data (only works if split = True) + # "total": randomizes total dataset + # + # Outputs: + # if split: + # X_cat_train (tensor): sparse features for training set + # X_int_train (tensor): dense features for training set + # y_train (tensor): labels for training set + # X_cat_val (tensor): sparse features for validation set + # X_int_val (tensor): dense features for validation set + # y_val (tensor): labels for validation set + # X_cat_test (tensor): sparse features for test set + # X_int_test (tensor): dense features for test set + # y_test (tensor): labels for test set + # else: + # X_cat (tensor): sparse features + # X_int (tensor): dense features + # y (tensor): label + + # define initial set of indices + indices = np.arange(len(y)) + + # create offset per file + offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + offset_per_file[i + 1] += offset_per_file[i] + + # split dataset + if data_split == 'train': + indices = np.array_split(indices, offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + test_indices, val_indices = np.array_split(test_indices, 2) + + print("Defined training and testing indices...") + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + # indices = np.concatenate((train_indices, test_indices)) + + # create training, validation, and test sets + X_cat_train = X_cat[train_indices] + X_int_train = X_int[train_indices] + y_train = y[train_indices] + + X_cat_val = X_cat[val_indices] + X_int_val = X_int[val_indices] + y_val = y[val_indices] + + X_cat_test = X_cat[test_indices] + X_int_test = X_int[test_indices] + y_test = y[test_indices] + + print("Split data according to indices...") + + X_cat_train = X_cat_train.astype(np.long) + X_int_train = np.log(X_int_train.astype(np.float32) + 1) + y_train = y_train.astype(np.float32) + + X_cat_val = X_cat_val.astype(np.long) + X_int_val = np.log(X_int_val.astype(np.float32) + 1) + y_val = y_val.astype(np.float32) + + X_cat_test = 
X_cat_test.astype(np.long) + X_int_test = np.log(X_int_test.astype(np.float32) + 1) + y_test = y_test.astype(np.float32) + + print("Converted to tensors...done!") + + return ( + X_cat_train, + X_int_train, + y_train, + X_cat_val, + X_int_val, + y_val, + X_cat_test, + X_int_test, + y_test, + ) + + else: + + # randomize data + if randomize == "total": + indices = np.random.permutation(indices) + print("Randomized indices...") + + X_cat = X_cat[indices].astype(np.long) + X_int = np.log(X_int[indices].astype(np.float32) + 1) + y = y[indices].astype(np.float32) + + print("Converted to tensors...done!") + + return (X_cat, X_int, y, [], [], [], [], [], []) + + +def getCriteoAdData( + datafile, + o_filename, + max_ind_range=-1, + sub_sample_rate=0.0, + days=7, + data_split='train', + randomize='total', + criteo_kaggle=True, + memory_map=False, + dataset_multiprocessing=False, +): + # Passes through entire dataset and defines dictionaries for categorical + # features and determines the number of total categories. + # + # Inputs: + # datafile : path to downloaded raw data file + # o_filename (str): saves results under o_filename if filename is not "" + # + # Output: + # o_file (str): output file path + + #split the datafile into path and filename + lstr = datafile.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + d_file = lstr[-1].split(".")[0] if criteo_kaggle else lstr[-1] + npzfile = d_path + ((d_file + "_day") if criteo_kaggle else d_file) + trafile = d_path + ((d_file + "_fea") if criteo_kaggle else "fea") + + # count number of datapoints in training set + total_file = d_path + d_file + "_day_count.npz" + if path.exists(total_file): + with np.load(total_file) as data: + total_per_file = list(data["total_per_file"]) + total_count = np.sum(total_per_file) + print("Skipping counts per file (already exist)") + else: + total_count = 0 + total_per_file = [] + if criteo_kaggle: + # WARNING: The raw data consists of a single train.txt file + # Each line in the file is a sample, consisting of 13 continuous and + # 26 categorical features (an extra space indicates that feature is + # missing and will be interpreted as 0). + if path.exists(datafile): + print("Reading data from path=%s" % (datafile)) + with open(str(datafile)) as f: + for _ in f: + total_count += 1 + total_per_file.append(total_count) + # reset total per file due to split + num_data_per_split, extras = divmod(total_count, days) + total_per_file = [num_data_per_split] * days + for j in range(extras): + total_per_file[j] += 1 + # split into days (simplifies code later on) + file_id = 0 + boundary = total_per_file[file_id] + nf = open(npzfile + "_" + str(file_id), "w") + with open(str(datafile)) as f: + for j, line in enumerate(f): + if j == boundary: + nf.close() + file_id += 1 + nf = open(npzfile + "_" + str(file_id), "w") + boundary += total_per_file[file_id] + nf.write(line) + nf.close() + else: + sys.exit("ERROR: Criteo Kaggle Display Ad Challenge Dataset path is invalid; please download from https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset") + else: + # WARNING: The raw data consist of day_0.gz,... ,day_23.gz text files + # Each line in the file is a sample, consisting of 13 continuous and + # 26 categorical features (an extra space indicates that feature is + # missing and will be interpreted as 0). 
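+            # (The loop below only counts samples per raw day file so that the
+            # per-day totals can later be cached in "<d_file>_day_count.npz" and
+            # the counting pass skipped on subsequent runs.)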
+ for i in range(days): + datafile_i = datafile + "_" + str(i) # + ".gz" + if path.exists(str(datafile_i)): + print("Reading data from path=%s" % (str(datafile_i))) + # file day_ + total_per_file_count = 0 + with open(str(datafile_i)) as f: + for _ in f: + total_per_file_count += 1 + total_per_file.append(total_per_file_count) + total_count += total_per_file_count + else: + sys.exit("ERROR: Criteo Terabyte Dataset path is invalid; please download from https://labs.criteo.com/2013/12/download-terabyte-click-logs") + + # process a file worth of data and reinitialize data + # note that a file main contain a single or multiple splits + def process_one_file( + datfile, + npzfile, + split, + num_data_in_split, + dataset_multiprocessing, + convertDictsDay=None, + resultDay=None + ): + if dataset_multiprocessing: + convertDicts_day = [{} for _ in range(26)] + + with open(str(datfile)) as f: + y = np.zeros(num_data_in_split, dtype="i4") # 4 byte int + X_int = np.zeros((num_data_in_split, 13), dtype="i4") # 4 byte int + X_cat = np.zeros((num_data_in_split, 26), dtype="i4") # 4 byte int + if sub_sample_rate == 0.0: + rand_u = 1.0 + else: + rand_u = np.random.uniform(low=0.0, high=1.0, size=num_data_in_split) + + i = 0 + percent = 0 + for k, line in enumerate(f): + # process a line (data point) + line = line.split('\t') + # set missing values to zero + for j in range(len(line)): + if (line[j] == '') or (line[j] == '\n'): + line[j] = '0' + # sub-sample data by dropping zero targets, if needed + target = np.int32(line[0]) + if target == 0 and \ + (rand_u if sub_sample_rate == 0.0 else rand_u[k]) < sub_sample_rate: + continue + + y[i] = target + X_int[i] = np.array(line[1:14], dtype=np.int32) + if max_ind_range > 0: + X_cat[i] = np.array( + list(map(lambda x: int(x, 16) % max_ind_range, line[14:])), + dtype=np.int32 + ) + else: + X_cat[i] = np.array( + list(map(lambda x: int(x, 16), line[14:])), + dtype=np.int32 + ) + + # count uniques + if dataset_multiprocessing: + for j in range(26): + convertDicts_day[j][X_cat[i][j]] = 1 + # debug prints + if float(i)/num_data_in_split*100 > percent+1: + percent = int(float(i)/num_data_in_split*100) + print( + "Load %d/%d (%d%%) Split: %d Label True: %d Stored: %d" + % ( + i, + num_data_in_split, + percent, + split, + target, + y[i], + ), + end="\n", + ) + else: + for j in range(26): + convertDicts[j][X_cat[i][j]] = 1 + # debug prints + print( + "Load %d/%d Split: %d Label True: %d Stored: %d" + % ( + i, + num_data_in_split, + split, + target, + y[i], + ), + end="\r", + ) + i += 1 + + # store num_data_in_split samples or extras at the end of file + # count uniques + # X_cat_t = np.transpose(X_cat) + # for j in range(26): + # for x in X_cat_t[j,:]: + # convertDicts[j][x] = 1 + # store parsed + filename_s = npzfile + "_{0}.npz".format(split) + if path.exists(filename_s): + print("\nSkip existing " + filename_s) + else: + np.savez_compressed( + filename_s, + X_int=X_int[0:i, :], + # X_cat=X_cat[0:i, :], + X_cat_t=np.transpose(X_cat[0:i, :]), # transpose of the data + y=y[0:i], + ) + print("\nSaved " + npzfile + "_{0}.npz!".format(split)) + + if dataset_multiprocessing: + resultDay[split] = i + convertDictsDay[split] = convertDicts_day + return + else: + return i + + # create all splits (reuse existing files if possible) + recreate_flag = False + convertDicts = [{} for _ in range(26)] + # WARNING: to get reproducable sub-sampling results you must reset the seed below + # np.random.seed(123) + # in this case there is a single split in each day + for i in range(days): + 
npzfile_i = npzfile + "_{0}.npz".format(i) + npzfile_p = npzfile + "_{0}_processed.npz".format(i) + if path.exists(npzfile_i): + print("Skip existing " + npzfile_i) + elif path.exists(npzfile_p): + print("Skip existing " + npzfile_p) + else: + recreate_flag = True + + if recreate_flag: + if dataset_multiprocessing: + resultDay = Manager().dict() + convertDictsDay = Manager().dict() + processes = [Process(target=process_one_file, + name="process_one_file:%i" % i, + args=(npzfile + "_{0}".format(i), + npzfile, + i, + total_per_file[i], + dataset_multiprocessing, + convertDictsDay, + resultDay, + ) + ) for i in range(0, days)] + for process in processes: + process.start() + for process in processes: + process.join() + for day in range(days): + total_per_file[day] = resultDay[day] + print("Constructing convertDicts Split: {}".format(day)) + convertDicts_tmp = convertDictsDay[day] + for i in range(26): + for j in convertDicts_tmp[i]: + convertDicts[i][j] = 1 + else: + for i in range(days): + total_per_file[i] = process_one_file( + npzfile + "_{0}".format(i), + npzfile, + i, + total_per_file[i], + dataset_multiprocessing, + ) + + # report and save total into a file + total_count = np.sum(total_per_file) + if not path.exists(total_file): + np.savez_compressed(total_file, total_per_file=total_per_file) + print("Total number of samples:", total_count) + print("Divided into days/splits:\n", total_per_file) + + # dictionary files + counts = np.zeros(26, dtype=np.int32) + if recreate_flag: + # create dictionaries + for j in range(26): + for i, x in enumerate(convertDicts[j]): + convertDicts[j][x] = i + dict_file_j = d_path + d_file + "_fea_dict_{0}.npz".format(j) + if not path.exists(dict_file_j): + np.savez_compressed( + dict_file_j, + unique=np.array(list(convertDicts[j]), dtype=np.int32) + ) + counts[j] = len(convertDicts[j]) + # store (uniques and) counts + count_file = d_path + d_file + "_fea_count.npz" + if not path.exists(count_file): + np.savez_compressed(count_file, counts=counts) + else: + # create dictionaries (from existing files) + for j in range(26): + with np.load(d_path + d_file + "_fea_dict_{0}.npz".format(j)) as data: + unique = data["unique"] + for i, x in enumerate(unique): + convertDicts[j][x] = i + # load (uniques and) counts + with np.load(d_path + d_file + "_fea_count.npz") as data: + counts = data["counts"] + + # process all splits + if dataset_multiprocessing: + processes = [Process(target=processCriteoAdData, + name="processCriteoAdData:%i" % i, + args=(d_path, + d_file, + npzfile, + i, + convertDicts, + counts, + ) + ) for i in range(0, days)] + for process in processes: + process.start() + for process in processes: + process.join() + + else: + for i in range(days): + processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, counts) + + o_file = concatCriteoAdData( + d_path, + d_file, + npzfile, + trafile, + days, + data_split, + randomize, + total_per_file, + total_count, + memory_map, + o_filename + ) + + return o_file + + +def loadDataset( + dataset, + max_ind_range, + sub_sample_rate, + randomize, + data_split, + raw_path="", + pro_data="", + memory_map=False +): + # dataset + if dataset == "kaggle": + days = 7 + o_filename = "kaggleAdDisplayChallenge_processed" + elif dataset == "terabyte": + days = 24 + o_filename = "terabyte_processed" + else: + raise(ValueError("Data set option is not supported")) + + # split the datafile into path and filename + lstr = raw_path.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + d_file = lstr[-1].split(".")[0] if dataset == 
"kaggle" else lstr[-1] + npzfile = (d_file + "_day") if dataset == "kaggle" else d_file + # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea") + + # check if pre-processed data is available + data_ready = True + if memory_map: + for i in range(days): + reo_data = d_path + npzfile + "_{0}_reordered.npz".format(i) + if not path.exists(str(reo_data)): + data_ready = False + else: + if not path.exists(str(pro_data)): + data_ready = False + + # pre-process data if needed + # WARNNING: when memory mapping is used we get a collection of files + if data_ready: + print("Reading pre-processed data=%s" % (str(pro_data))) + file = str(pro_data) + else: + print("Reading raw data=%s" % (str(raw_path))) + file = getCriteoAdData( + raw_path, + o_filename, + max_ind_range, + sub_sample_rate, + days, + data_split, + randomize, + dataset == "kaggle", + memory_map + ) + + return file, days + + +if __name__ == "__main__": + ### import packages ### + import argparse + + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Preprocess Criteo dataset" + ) + # model related parameters + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--memory-map", action="store_true", default=False) + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + args = parser.parse_args() + + loadDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map + ) diff --git a/benchmarks/dlrm/ootb/dlrm_data_caffe2.py b/benchmarks/dlrm/ootb/dlrm_data_caffe2.py new file mode 100644 index 0000000..0bda2ac --- /dev/null +++ b/benchmarks/dlrm/ootb/dlrm_data_caffe2.py @@ -0,0 +1,843 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: generate inputs and targets for the dlrm benchmark +# The inpts and outputs are generated according to the following three option(s) +# 1) random distribution +# 2) synthetic distribution, based on unique accesses and distances between them +# i) R. Hassan, A. Harris, N. Topham and A. 
Efthymiou "Synthetic Trace-Driven +# Simulation of Cache Memory", IEEE AINAM'07 +# 3) public data set +# i) Criteo Kaggle Display Advertising Challenge Dataset +# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset +# ii) Criteo Terabyte Dataset +# https://labs.criteo.com/2013/12/download-terabyte-click-logs + + +from __future__ import absolute_import, division, print_function, unicode_literals + +import bisect +import collections + +# others +# from os import path +import sys + +import data_utils + +# numpy +import numpy as np + +# pytorch +import torch +from numpy import random as ra +from torch.utils.data import Dataset + + +# Kaggle Display Advertising Challenge Dataset +# dataset (str): name of dataset (Kaggle or Terabyte) +# randomize (str): determines randomization scheme +# 'none': no randomization +# 'day': randomizes each day's data (only works if split = True) +# 'total': randomizes total dataset +# split (bool) : to split into train, test, validation data-sets + + +class CriteoDatasetWMemoryMap(Dataset): + def __init__( + self, + dataset, + max_ind_range, + sub_sample_rate, + randomize, + split="train", + raw_path="", + pro_data="", + ): + # dataset + # tar_fea = 1 # single target + den_fea = 13 # 13 dense features + # spa_fea = 26 # 26 sparse features + # tad_fea = tar_fea + den_fea + # tot_fea = tad_fea + spa_fea + if dataset == "kaggle": + days = 7 + elif dataset == "terabyte": + days = 24 + else: + raise (ValueError("Data set option is not supported")) + self.max_ind_range = max_ind_range + + # split the datafile into path and filename + lstr = raw_path.split("/") + self.d_path = "/".join(lstr[0:-1]) + "/" + self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1] + self.npzfile = self.d_path + ( + (self.d_file + "_day") if dataset == "kaggle" else self.d_file + ) + self.trafile = self.d_path + ( + (self.d_file + "_fea") if dataset == "kaggle" else "fea" + ) + + # get a number of samples per day + total_file = self.d_path + self.d_file + "_day_count.npz" + with np.load(total_file) as data: + total_per_file = data["total_per_file"] + # compute offsets per file + self.offset_per_file = np.array([0] + list(total_per_file)) + for i in range(days): + self.offset_per_file[i + 1] += self.offset_per_file[i] + # print(self.offset_per_file) + + # setup data + self.split = split + if split == "none" or split == "train": + self.day = 0 + self.max_day_range = days if split == "none" else days - 1 + elif split == "test" or split == "val": + self.day = days - 1 + num_samples = self.offset_per_file[days] - self.offset_per_file[days - 1] + self.test_size = int(np.ceil(num_samples / 2.0)) + self.val_size = num_samples - self.test_size + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + + # load unique counts + with np.load(self.d_path + self.d_file + "_fea_count.npz") as data: + self.counts = data["counts"] + self.m_den = den_fea # X_int.shape[1] + self.n_emb = len(self.counts) + print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den)) + + # Load the test data + # Only a single day is used for testing + if self.split == "test" or self.split == "val": + # only a single day is used for testing + fi = self.npzfile + "_{0}_reordered.npz".format(self.day) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + + def __getitem__(self, index): + + if isinstance(index, slice): + return [ + self[idx] + for 
idx in range( + index.start or 0, index.stop or len(self), index.step or 1 + ) + ] + if self.split == "none" or self.split == "train": + # check if need to swicth to next day and load data + if index == self.offset_per_file[self.day]: + # print("day_boundary switch", index) + self.day_boundary = self.offset_per_file[self.day] + fi = self.npzfile + "_{0}_reordered.npz".format(self.day) + # print('Loading file: ', fi) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + self.day = (self.day + 1) % self.max_day_range + + i = index - self.day_boundary + elif self.split == "test" or self.split == "val": + # only a single day is used for testing + i = index + (0 if self.split == "test" else self.test_size) + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + + if self.max_ind_range > 0: + return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i] + else: + return self.X_int[i], self.X_cat[i], self.y[i] + + def _default_preprocess(self, X_int, X_cat, y): + X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1) + if self.max_ind_range > 0: + X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long) + else: + X_cat = torch.tensor(X_cat, dtype=torch.long) + y = torch.tensor(y.astype(np.float32)) + + return X_int, X_cat, y + + def __len__(self): + if self.split == "none": + return self.offset_per_file[-1] + elif self.split == "train": + return self.offset_per_file[-2] + elif self.split == "test": + return self.test_size + elif self.split == "val": + return self.val_size + else: + sys.exit("ERROR: dataset split is neither none, nor train nor test.") + + +def collate_wrapper_criteo(list_of_tuples): + # where each tuple is (X_int, X_cat, y) + transposed_data = list(zip(*list_of_tuples)) + X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) + X_cat = torch.tensor(transposed_data[1], dtype=torch.long) + T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) + + batchSize = X_cat.shape[0] + featureCnt = X_cat.shape[1] + + lS_i = [X_cat[:, i] for i in range(featureCnt)] + lS_o = [torch.tensor(range(batchSize)) for _ in range(featureCnt)] + + return X_int, torch.stack(lS_o), torch.stack(lS_i), T + + +# Conversion from offset to length +def offset_to_length_convertor(lS_o, lS_i): + def diff(tensor): + return tensor[1:] - tensor[:-1] + + return torch.stack( + [ + diff(torch.cat((S_o, torch.tensor(lS_i[ind].shape))).int()) + for ind, S_o in enumerate(lS_o) + ] + ) + + +def unpack_batch(b, data_gen, data_set): + return b[0], b[1], b[2], b[3], torch.ones(b[3].size()) + + +def read_dataset( + dataset, + max_ind_range, + sub_sample_rate, + mini_batch_size, + num_batches, + randomize, + split="train", + raw_data="", + processed_data="", + memory_map=False, + inference_only=False, + test_mini_batch_size=1, +): + # split the datafile into path and filename + lstr = raw_data.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1] + # npzfile = d_path + ((d_file + "_day") if dataset == "kaggle" else d_file) + # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea") + + # load + print("Loading %s dataset..." 
% dataset) + nbatches = 0 + file, days = data_utils.loadDataset( + dataset, + max_ind_range, + sub_sample_rate, + randomize, + split, + raw_data, + processed_data, + memory_map, + ) + + if memory_map: + # WARNING: at this point the data has been reordered and shuffled across files + # e.g. day__reordered.npz, what remains is simply to read and feed + # the data from each file, going in the order of days file-by-file, to the + # model during training. + train_data = CriteoDatasetWMemoryMap( + dataset, + max_ind_range, + sub_sample_rate, + randomize, + "train", + raw_data, + processed_data, + ) + + test_data = CriteoDatasetWMemoryMap( + dataset, + max_ind_range, + sub_sample_rate, + randomize, + "test", + raw_data, + processed_data, + ) + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=mini_batch_size, + shuffle=False, + num_workers=0, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=test_mini_batch_size, + shuffle=False, + num_workers=0, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + return train_data, train_loader, test_data, test_loader + + else: + # load and preprocess data + with np.load(file) as data: + X_int = data["X_int"] + X_cat = data["X_cat"] + y = data["y"] + counts = data["counts"] + + # get a number of samples per day + total_file = d_path + d_file + "_day_count.npz" + with np.load(total_file) as data: + total_per_file = data["total_per_file"] + + # transform + ( + X_cat_train, + X_int_train, + y_train, + X_cat_val, + X_int_val, + y_val, + X_cat_test, + X_int_test, + y_test, + ) = data_utils.transformCriteoAdData( + X_cat, X_int, y, days, split, randomize, total_per_file + ) + ln_emb = counts + m_den = X_int_train.shape[1] + n_emb = len(counts) + print("Sparse features = %d, Dense features = %d" % (n_emb, m_den)) + + # adjust parameters + def assemble_samples(X_cat, X_int, y, max_ind_range, print_message): + if max_ind_range > 0: + X_cat = X_cat % max_ind_range + + nsamples = len(y) + data_size = nsamples + # using floor is equivalent to dropping last mini-batch (drop_last = True) + nbatches = int(np.floor((data_size * 1.0) / mini_batch_size)) + print(print_message) + if num_batches != 0 and num_batches < nbatches: + print( + "Limiting to %d batches of the total % d batches" + % (num_batches, nbatches) + ) + nbatches = num_batches + else: + print("Total number of batches %d" % nbatches) + + # data main loop + lX = [] + lS_lengths = [] + lS_indices = [] + lT = [] + for j in range(0, nbatches): + # number of data points in a batch + print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r") + n = min(mini_batch_size, data_size - (j * mini_batch_size)) + # dense feature + idx_start = j * mini_batch_size + lX.append((X_int[idx_start : (idx_start + n)]).astype(np.float32)) + # Targets - outputs + lT.append( + (y[idx_start : idx_start + n]).reshape(-1, 1).astype(np.int32) + ) + # sparse feature (sparse indices) + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for size in range(n_emb): + lS_batch_indices = [] + for _b in range(n): + # num of sparse indices to be used per embedding, e.g. 
for + # store lengths and indices + lS_batch_indices += ( + (X_cat[idx_start + _b][size].reshape(-1)).astype(np.int32) + ).tolist() + lS_emb_indices.append(lS_batch_indices) + lS_indices.append(lS_emb_indices) + # Criteo Kaggle data it is 1 because data is categorical + lS_lengths.append( + [(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)] + ) + print("\n") + + return nbatches, lX, lS_lengths, lS_indices, lT + + # adjust training data + (nbatches, lX, lS_lengths, lS_indices, lT) = assemble_samples( + X_cat_train, X_int_train, y_train, max_ind_range, "Training data" + ) + + # adjust testing data + (nbatches_t, lX_t, lS_lengths_t, lS_indices_t, lT_t) = assemble_samples( + X_cat_test, X_int_test, y_test, max_ind_range, "Testing data" + ) + # end if memory_map + + return ( + nbatches, + lX, + lS_lengths, + lS_indices, + lT, + nbatches_t, + lX_t, + lS_lengths_t, + lS_indices_t, + lT_t, + ln_emb, + m_den, + ) + + +def generate_random_data( + m_den, + ln_emb, + data_size, + num_batches, + mini_batch_size, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + num_targets=1, + round_targets=False, + data_generation="random", + trace_file="", + enable_padding=False, +): + nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size)) + if num_batches != 0: + nbatches = num_batches + data_size = nbatches * mini_batch_size + # print("Total number of batches %d" % nbatches) + + # inputs and targets + lT = [] + lX = [] + lS_lengths = [] + lS_indices = [] + for j in range(0, nbatches): + # number of data points in a batch + n = min(mini_batch_size, data_size - (j * mini_batch_size)) + + # generate a batch of dense and sparse features + if data_generation == "random": + (Xt, lS_emb_lengths, lS_emb_indices) = generate_uniform_input_batch( + m_den, ln_emb, n, num_indices_per_lookup, num_indices_per_lookup_fixed + ) + elif data_generation == "synthetic": + (Xt, lS_emb_lengths, lS_emb_indices) = generate_synthetic_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + trace_file, + enable_padding, + ) + else: + sys.exit( + "ERROR: --data-generation=" + data_generation + " is not supported" + ) + # dense feature + lX.append(Xt) + # sparse feature (sparse indices) + lS_lengths.append(lS_emb_lengths) + lS_indices.append(lS_emb_indices) + + # generate a batch of target (probability of a click) + P = generate_random_output_batch(n, num_targets, round_targets) + lT.append(P) + + return (nbatches, lX, lS_lengths, lS_indices, lT) + + +def generate_random_output_batch(n, num_targets=1, round_targets=False): + # target (probability of a click) + if round_targets: + P = np.round(ra.rand(n, num_targets).astype(np.float32)).astype(np.int32) + else: + P = ra.rand(n, num_targets).astype(np.float32) + + return P + + +# uniform ditribution (input data) +def generate_uniform_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, +): + # dense feature + Xt = ra.rand(n, m_den).astype(np.float32) + + # sparse feature (sparse indices) + lS_emb_lengths = [] + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for size in ln_emb: + lS_batch_lengths = [] + lS_batch_indices = [] + for _ in range(n): + # num of sparse indices to be used per embedding (between + if num_indices_per_lookup_fixed: + sparse_group_size = np.int32(num_indices_per_lookup) + else: + # random between [1,num_indices_per_lookup]) + r = ra.random(1) + sparse_group_size = np.int32( + 
max(1, np.round(r * min(size, num_indices_per_lookup))[0]) + ) + # sparse indices to be used per embedding + r = ra.random(sparse_group_size) + sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int32)) + # reset sparse_group_size in case some index duplicates were removed + sparse_group_size = np.int32(sparse_group.size) + # store lengths and indices + lS_batch_lengths += [sparse_group_size] + lS_batch_indices += sparse_group.tolist() + lS_emb_lengths.append(lS_batch_lengths) + lS_emb_indices.append(lS_batch_indices) + + return (Xt, lS_emb_lengths, lS_emb_indices) + + +# synthetic distribution (input data) +def generate_synthetic_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + trace_file, + enable_padding=False, +): + # dense feature + Xt = ra.rand(n, m_den).astype(np.float32) + + # sparse feature (sparse indices) + lS_emb_lengths = [] + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for i, size in enumerate(ln_emb): + lS_batch_lengths = [] + lS_batch_indices = [] + for _ in range(n): + # num of sparse indices to be used per embedding (between + if num_indices_per_lookup_fixed: + sparse_group_size = np.int32(num_indices_per_lookup) + else: + # random between [1,num_indices_per_lookup]) + r = ra.random(1) + sparse_group_size = np.int32( + max(1, np.round(r * min(size, num_indices_per_lookup))[0]) + ) + # sparse indices to be used per embedding + file_path = trace_file + line_accesses, list_sd, cumm_sd = read_dist_from_file( + file_path.replace("j", str(i)) + ) + # debug print + # print('input') + # print(line_accesses); print(list_sd); print(cumm_sd); + # print(sparse_group_size) + # approach 1: rand + # r = trace_generate_rand( + # line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding + # ) + # approach 2: lru + r = trace_generate_lru( + line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding + ) + # WARNING: if the distribution in the file is not consistent with + # embedding table dimensions, below mod guards against out of + # range access + sparse_group = np.unique(r).astype(np.int32) + minsg = np.min(sparse_group) + maxsg = np.max(sparse_group) + if (minsg < 0) or (size <= maxsg): + print( + "WARNING: distribution is inconsistent with embedding " + + "table size (using mod to recover and continue)" + ) + sparse_group = np.mod(sparse_group, size).astype(np.int32) + # sparse_group = np.unique(np.array(np.mod(r, size-1)).astype(np.int32)) + # reset sparse_group_size in case some index duplicates were removed + sparse_group_size = np.int32(sparse_group.size) + # store lengths and indices + lS_batch_lengths += [sparse_group_size] + lS_batch_indices += sparse_group.tolist() + lS_emb_lengths.append(lS_batch_lengths) + lS_emb_indices.append(lS_batch_indices) + + return (Xt, lS_emb_lengths, lS_emb_indices) + + +def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False): + u = ra.rand(1) + if i < max_i: + # only generate stack distances up to the number of new references seen so far + j = bisect.bisect(cumm_val, i) - 1 + fi = cumm_dist[j] + u *= fi # shrink distribution support to exclude last values + elif enable_padding: + # WARNING: disable generation of new references (once all have been seen) + fi = cumm_dist[0] + u = (1.0 - fi) * u + fi # remap distribution support to exclude first value + + for (j, f) in enumerate(cumm_dist): + if u <= f: + return cumm_val[j] + + +# WARNING: global define, must be 
consistent across all synthetic functions +cache_line_size = 1 + + +def trace_generate_lru( + line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False +): + max_sd = list_sd[-1] + l = len(line_accesses) + i = 0 + ztrace = [] + for _ in range(out_trace_len): + sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding) + mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0 + # generate memory reference + if sd == 0: # new reference # + line_ref = line_accesses.pop(0) + line_accesses.append(line_ref) + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + i += 1 + else: # existing reference # + line_ref = line_accesses[l - sd] + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + line_accesses.pop(l - sd) + line_accesses.append(line_ref) + # save generated memory reference + ztrace.append(mem_ref) + + return ztrace + + +def trace_generate_rand( + line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False +): + max_sd = list_sd[-1] + l = len(line_accesses) # !!!Unique, + i = 0 + ztrace = [] + for _ in range(out_trace_len): + sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding) + mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0 + # generate memory reference + if sd == 0: # new reference # + line_ref = line_accesses.pop(0) + line_accesses.append(line_ref) + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + i += 1 + else: # existing reference # + line_ref = line_accesses[l - sd] + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + ztrace.append(mem_ref) + + return ztrace + + +def trace_profile(trace, enable_padding=False): + # number of elements in the array (assuming 1D) + # n = trace.size + + rstack = [] # S + stack_distances = [] # SDS + line_accesses = [] # L + for x in trace: + r = np.uint64(x / cache_line_size) + l = len(rstack) + try: # found # + i = rstack.index(r) + # WARNING: I believe below is the correct depth in terms of meaning of the + # algorithm, but that is not what seems to be in the paper alg. + # -1 can be subtracted if we defined the distance between + # consecutive accesses (e.g. r, r) as 0 rather than 1. + sd = l - i # - 1 + # push r to the end of stack_distances + stack_distances.insert(0, sd) + # remove r from its position and insert to the top of stack + rstack.pop(i) # rstack.remove(r) + rstack.insert(l - 1, r) + except ValueError: # not found # + sd = 0 # -1 + # push r to the end of stack_distances/line_accesses + stack_distances.insert(0, sd) + line_accesses.insert(0, r) + # push r to the top of stack + rstack.insert(l, r) + + if enable_padding: + # WARNING: notice that as the ratio between the number of samples (l) + # and cardinality (c) of a sample increases the probability of + # generating a sample gets smaller and smaller because there are + # few new samples compared to repeated samples. This means that for a + # long trace with relatively small cardinality it will take longer to + # generate all new samples and therefore obtain full distribution support + # and hence it takes longer for distribution to resemble the original. + # Therefore, we may pad the number of new samples to be on par with + # average number of samples l/c artificially. 
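+        # illustrative example (hypothetical numbers): with l = 1000 recorded stack
+        # distances and a largest distance c = 50, padding = ceil(1000 / 50) = 20,
+        # i.e. 20 zero stack distances (artificial "new reference" samples) are appended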
+ l = len(stack_distances) + c = max(stack_distances) + padding = int(np.ceil(l / c)) + stack_distances = stack_distances + [0] * padding + + return (rstack, stack_distances, line_accesses) + + +# auxiliary read/write routines +def read_trace_from_file(file_path): + try: + with open(file_path) as f: + if args.trace_file_binary_type: + array = np.fromfile(f, dtype=np.uint64) + trace = array.astype(np.uint64).tolist() + else: + line = f.readline() + trace = list(map(lambda x: np.uint64(x), line.split(", "))) + return trace + except Exception: + print("ERROR: no input trace file has been provided") + + +def write_trace_to_file(file_path, trace): + try: + if args.trace_file_binary_type: + with open(file_path, "wb+") as f: + np.array(trace).astype(np.uint64).tofile(f) + else: + with open(file_path, "w+") as f: + s = str(trace) + f.write(s[1 : len(s) - 1]) + except Exception: + print("ERROR: no output trace file has been provided") + + +def read_dist_from_file(file_path): + try: + with open(file_path, "r") as f: + lines = f.read().splitlines() + except Exception: + print("Wrong file or file path") + # read unique accesses + unique_accesses = [int(el) for el in lines[0].split(", ")] + # read cumulative distribution (elements are passed as two separate lists) + list_sd = [int(el) for el in lines[1].split(", ")] + cumm_sd = [float(el) for el in lines[2].split(", ")] + + return unique_accesses, list_sd, cumm_sd + + +def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd): + try: + with open(file_path, "w") as f: + # unique_acesses + s = str(unique_accesses) + f.write(s[1 : len(s) - 1] + "\n") + # list_sd + s = str(list_sd) + f.write(s[1 : len(s) - 1] + "\n") + # cumm_sd + s = str(cumm_sd) + f.write(s[1 : len(s) - 1] + "\n") + except Exception: + print("Wrong file or file path") + + +if __name__ == "__main__": + import sys + import operator + import argparse + + ### parse arguments ### + parser = argparse.ArgumentParser(description="Generate Synthetic Distributions") + parser.add_argument("--trace-file", type=str, default="./input/trace.log") + parser.add_argument("--trace-file-binary-type", type=bool, default=False) + parser.add_argument("--trace-enable-padding", type=bool, default=False) + parser.add_argument("--dist-file", type=str, default="./input/dist.log") + parser.add_argument( + "--synthetic-file", type=str, default="./input/trace_synthetic.log" + ) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + parser.add_argument("--print-precision", type=int, default=5) + args = parser.parse_args() + + ### some basic setup ### + np.random.seed(args.numpy_rand_seed) + np.set_printoptions(precision=args.print_precision) + + ### read trace ### + trace = read_trace_from_file(args.trace_file) + # print(trace) + + ### profile trace ### + (_, stack_distances, line_accesses) = trace_profile( + trace, args.trace_enable_padding + ) + stack_distances.reverse() + line_accesses.reverse() + # print(line_accesses) + # print(stack_distances) + + ### compute probability distribution ### + # count items + l = len(stack_distances) + dc = sorted( + collections.Counter(stack_distances).items(), key=operator.itemgetter(0) + ) + + # create a distribution + list_sd = list(map(lambda tuple_x_k: tuple_x_k[0], dc)) # x = tuple_x_k[0] + dist_sd = list( + map(lambda tuple_x_k: tuple_x_k[1] / float(l), dc) + ) # k = tuple_x_k[1] + cumm_sd = [] # np.cumsum(dc).tolist() #prefixsum + for i, (_, k) in enumerate(dc): + if i == 0: + cumm_sd.append(k / float(l)) + else: + # add the 2nd element of the i-th 
tuple in the dist_sd list + cumm_sd.append(cumm_sd[i - 1] + (k / float(l))) + + ### write stack_distance and line_accesses to a file ### + write_dist_to_file(args.dist_file, line_accesses, list_sd, cumm_sd) + + ### generate correspondinf synthetic ### + # line_accesses, list_sd, cumm_sd = read_dist_from_file(args.dist_file) + synthetic_trace = trace_generate_lru( + line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding + ) + # synthetic_trace = trace_generate_rand( + # line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding + # ) + write_trace_to_file(args.synthetic_file, synthetic_trace) diff --git a/benchmarks/dlrm/ootb/dlrm_data_pytorch.py b/benchmarks/dlrm/ootb/dlrm_data_pytorch.py new file mode 100644 index 0000000..9c4fa89 --- /dev/null +++ b/benchmarks/dlrm/ootb/dlrm_data_pytorch.py @@ -0,0 +1,1309 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: generate inputs and targets for the dlrm benchmark +# The inpts and outputs are generated according to the following three option(s) +# 1) random distribution +# 2) synthetic distribution, based on unique accesses and distances between them +# i) R. Hassan, A. Harris, N. Topham and A. Efthymiou "Synthetic Trace-Driven +# Simulation of Cache Memory", IEEE AINAM'07 +# 3) public data set +# i) Criteo Kaggle Display Advertising Challenge Dataset +# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset +# ii) Criteo Terabyte Dataset +# https://labs.criteo.com/2013/12/download-terabyte-click-logs + + +from __future__ import absolute_import, division, print_function, unicode_literals + +# others +from os import path +import sys +import functools +import bisect +import collections + +import data_utils + +# numpy +import numpy as np +from numpy import random as ra +from collections import deque + + +# pytorch +import torch +from torch.utils.data import Dataset, RandomSampler + +import data_loader_terabyte +import mlperf_logger + + +# Kaggle Display Advertising Challenge Dataset +# dataset (str): name of dataset (Kaggle or Terabyte) +# randomize (str): determines randomization scheme +# "none": no randomization +# "day": randomizes each day"s data (only works if split = True) +# "total": randomizes total dataset +# split (bool) : to split into train, test, validation data-sets +class CriteoDataset(Dataset): + + def __init__( + self, + dataset, + max_ind_range, + sub_sample_rate, + randomize, + split="train", + raw_path="", + pro_data="", + memory_map=False, + dataset_multiprocessing=False, + ): + # dataset + # tar_fea = 1 # single target + den_fea = 13 # 13 dense features + # spa_fea = 26 # 26 sparse features + # tad_fea = tar_fea + den_fea + # tot_fea = tad_fea + spa_fea + if dataset == "kaggle": + days = 7 + out_file = "kaggleAdDisplayChallenge_processed" + elif dataset == "terabyte": + days = 24 + out_file = "terabyte_processed" + else: + raise(ValueError("Data set option is not supported")) + self.max_ind_range = max_ind_range + self.memory_map = memory_map + + # split the datafile into path and filename + lstr = raw_path.split("/") + self.d_path = "/".join(lstr[0:-1]) + "/" + self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1] + self.npzfile = self.d_path + ( + (self.d_file + "_day") if dataset == "kaggle" else self.d_file + ) + self.trafile = self.d_path + ( + (self.d_file + "_fea") if dataset == "kaggle" else "fea" + 
) + + # check if pre-processed data is available + data_ready = True + if memory_map: + for i in range(days): + reo_data = self.npzfile + "_{0}_reordered.npz".format(i) + if not path.exists(str(reo_data)): + data_ready = False + else: + if not path.exists(str(pro_data)): + data_ready = False + + # pre-process data if needed + # WARNNING: when memory mapping is used we get a collection of files + if data_ready: + print("Reading pre-processed data=%s" % (str(pro_data))) + file = str(pro_data) + else: + print("Reading raw data=%s" % (str(raw_path))) + file = data_utils.getCriteoAdData( + raw_path, + out_file, + max_ind_range, + sub_sample_rate, + days, + split, + randomize, + dataset == "kaggle", + memory_map, + dataset_multiprocessing, + ) + + # get a number of samples per day + total_file = self.d_path + self.d_file + "_day_count.npz" + with np.load(total_file) as data: + total_per_file = data["total_per_file"] + # compute offsets per file + self.offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + self.offset_per_file[i + 1] += self.offset_per_file[i] + # print(self.offset_per_file) + + # setup data + if memory_map: + # setup the training/testing split + self.split = split + if split == 'none' or split == 'train': + self.day = 0 + self.max_day_range = days if split == 'none' else days - 1 + elif split == 'test' or split == 'val': + self.day = days - 1 + num_samples = self.offset_per_file[days] - \ + self.offset_per_file[days - 1] + self.test_size = int(np.ceil(num_samples / 2.)) + self.val_size = num_samples - self.test_size + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + + ''' + # text + print("text") + for i in range(days): + fi = self.npzfile + "_{0}".format(i) + with open(fi) as data: + ttt = 0; nnn = 0 + for _j, line in enumerate(data): + ttt +=1 + if np.int32(line[0]) > 0: + nnn +=1 + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") + # processed + print("processed") + for i in range(days): + fi = self.npzfile + "_{0}_processed.npz".format(i) + with np.load(fi) as data: + yyy = data["y"] + ttt = len(yyy) + nnn = np.count_nonzero(yyy) + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") + # reordered + print("reordered") + for i in range(days): + fi = self.npzfile + "_{0}_reordered.npz".format(i) + with np.load(fi) as data: + yyy = data["y"] + ttt = len(yyy) + nnn = np.count_nonzero(yyy) + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) 
/ ttt) + "%") + ''' + + # load unique counts + with np.load(self.d_path + self.d_file + "_fea_count.npz") as data: + self.counts = data["counts"] + self.m_den = den_fea # X_int.shape[1] + self.n_emb = len(self.counts) + print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den)) + + # Load the test data + # Only a single day is used for testing + if self.split == 'test' or self.split == 'val': + # only a single day is used for testing + fi = self.npzfile + "_{0}_reordered.npz".format( + self.day + ) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + + else: + # load and preprocess data + with np.load(file) as data: + X_int = data["X_int"] # continuous feature + X_cat = data["X_cat"] # categorical feature + y = data["y"] # target + self.counts = data["counts"] + self.m_den = X_int.shape[1] # den_fea + self.n_emb = len(self.counts) + print("Sparse fea = %d, Dense fea = %d" % (self.n_emb, self.m_den)) + + # create reordering + indices = np.arange(len(y)) + + if split == "none": + # randomize all data + if randomize == "total": + indices = np.random.permutation(indices) + print("Randomized indices...") + + X_int[indices] = X_int + X_cat[indices] = X_cat + y[indices] = y + + else: + indices = np.array_split(indices, self.offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + test_indices, val_indices = np.array_split(test_indices, 2) + + print("Defined %s indices..." % (split)) + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + # create training, validation, and test sets + if split == 'train': + self.X_int = [X_int[i] for i in train_indices] + self.X_cat = [X_cat[i] for i in train_indices] + self.y = [y[i] for i in train_indices] + elif split == 'val': + self.X_int = [X_int[i] for i in val_indices] + self.X_cat = [X_cat[i] for i in val_indices] + self.y = [y[i] for i in val_indices] + elif split == 'test': + self.X_int = [X_int[i] for i in test_indices] + self.X_cat = [X_cat[i] for i in test_indices] + self.y = [y[i] for i in test_indices] + + print("Split data according to indices...") + + def __getitem__(self, index): + + if isinstance(index, slice): + return [ + self[idx] for idx in range( + index.start or 0, index.stop or len(self), index.step or 1 + ) + ] + + if self.memory_map: + if self.split == 'none' or self.split == 'train': + # check if need to swicth to next day and load data + if index == self.offset_per_file[self.day]: + # print("day_boundary switch", index) + self.day_boundary = self.offset_per_file[self.day] + fi = self.npzfile + "_{0}_reordered.npz".format( + self.day + ) + # print('Loading file: ', fi) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + self.day = (self.day + 1) % self.max_day_range + + i = index - self.day_boundary + elif self.split == 'test' or self.split == 'val': + # only a single day is used for testing + i = index + (0 if self.split == 'test' else self.test_size) + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + 
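+        # without memory mapping, __init__ has already loaded this split's samples
+        # into self.X_int / self.X_cat / self.y, so the requested index is used directly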
else: + i = index + + if self.max_ind_range > 0: + return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i] + else: + return self.X_int[i], self.X_cat[i], self.y[i] + + def _default_preprocess(self, X_int, X_cat, y): + X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1) + if self.max_ind_range > 0: + X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long) + else: + X_cat = torch.tensor(X_cat, dtype=torch.long) + y = torch.tensor(y.astype(np.float32)) + + return X_int, X_cat, y + + def __len__(self): + if self.memory_map: + if self.split == 'none': + return self.offset_per_file[-1] + elif self.split == 'train': + return self.offset_per_file[-2] + elif self.split == 'test': + return self.test_size + elif self.split == 'val': + return self.val_size + else: + sys.exit("ERROR: dataset split is neither none, nor train nor test.") + else: + return len(self.y) + + +def collate_wrapper_criteo_offset(list_of_tuples): + # where each tuple is (X_int, X_cat, y) + transposed_data = list(zip(*list_of_tuples)) + X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) + X_cat = torch.tensor(transposed_data[1], dtype=torch.long) + T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) + + batchSize = X_cat.shape[0] + featureCnt = X_cat.shape[1] + + lS_i = [X_cat[:, i] for i in range(featureCnt)] + lS_o = [torch.tensor(range(batchSize)) for _ in range(featureCnt)] + + return X_int, torch.stack(lS_o), torch.stack(lS_i), T + + +def ensure_dataset_preprocessed(args, d_path): + _ = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + _ = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + for split in ['train', 'val', 'test']: + print('Running preprocessing for split =', split) + + train_files = ['{}_{}_reordered.npz'.format(args.raw_data_file, day) + for + day in range(0, 23)] + + test_valid_file = args.raw_data_file + '_23_reordered.npz' + + output_file = d_path + '_{}.bin'.format(split) + + input_files = train_files if split == 'train' else [test_valid_file] + data_loader_terabyte.numpy_to_binary(input_files=input_files, + output_file_path=output_file, + split=split) + + +# Conversion from offset to length +def offset_to_length_converter(lS_o, lS_i): + def diff(tensor): + return tensor[1:] - tensor[:-1] + + return torch.stack( + [ + diff(torch.cat((S_o, torch.tensor(lS_i[ind].shape))).int()) + for ind, S_o in enumerate(lS_o) + ] + ) + + +def collate_wrapper_criteo_length(list_of_tuples): + # where each tuple is (X_int, X_cat, y) + transposed_data = list(zip(*list_of_tuples)) + X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) + X_cat = torch.tensor(transposed_data[1], dtype=torch.long) + T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) + + batchSize = X_cat.shape[0] + featureCnt = X_cat.shape[1] + + lS_i = torch.stack([X_cat[:, i] for i in range(featureCnt)]) + lS_o = torch.stack( + [torch.tensor(range(batchSize)) for _ in range(featureCnt)] + ) + + lS_l = offset_to_length_converter(lS_o, lS_i) + + return X_int, lS_l, lS_i, T + + +def make_criteo_data_and_loaders(args, offset_to_length_converter=False): + if args.mlperf_logging and args.memory_map and 
args.data_set == "terabyte": + # more efficient for larger batches + data_directory = path.dirname(args.raw_data_file) + + if args.mlperf_bin_loader: + lstr = args.processed_data_file.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + lstr[-1].split(".")[0] + train_file = d_path + "_train.bin" + test_file = d_path + "_test.bin" + # val_file = d_path + "_val.bin" + counts_file = args.raw_data_file + '_fea_count.npz' + + if any(not path.exists(p) for p in [train_file, + test_file, + counts_file]): + ensure_dataset_preprocessed(args, d_path) + + train_data = data_loader_terabyte.CriteoBinDataset( + data_file=train_file, + counts_file=counts_file, + batch_size=args.mini_batch_size, + max_ind_range=args.max_ind_range + ) + + mlperf_logger.log_event(key=mlperf_logger.constants.TRAIN_SAMPLES, + value=train_data.num_samples) + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=None, + batch_sampler=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + sampler=RandomSampler(train_data) if args.mlperf_bin_shuffle else None + ) + + test_data = data_loader_terabyte.CriteoBinDataset( + data_file=test_file, + counts_file=counts_file, + batch_size=args.test_mini_batch_size, + max_ind_range=args.max_ind_range + ) + + mlperf_logger.log_event(key=mlperf_logger.constants.EVAL_SAMPLES, + value=test_data.num_samples) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=None, + batch_sampler=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + ) + else: + data_filename = args.raw_data_file.split("/")[-1] + + train_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + test_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + train_loader = data_loader_terabyte.DataLoader( + data_directory=data_directory, + data_filename=data_filename, + days=list(range(23)), + batch_size=args.mini_batch_size, + max_ind_range=args.max_ind_range, + split="train" + ) + + test_loader = data_loader_terabyte.DataLoader( + data_directory=data_directory, + data_filename=data_filename, + days=[23], + batch_size=args.test_mini_batch_size, + max_ind_range=args.max_ind_range, + split="test" + ) + else: + train_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing, + ) + + test_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing, + ) + + collate_wrapper_criteo = collate_wrapper_criteo_offset + if offset_to_length_converter: + collate_wrapper_criteo = collate_wrapper_criteo_length + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=args.mini_batch_size, + shuffle=False, + num_workers=args.num_workers, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=args.test_mini_batch_size, + shuffle=False, + 
num_workers=args.test_num_workers, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + return train_data, train_loader, test_data, test_loader + + +# uniform ditribution (input data) +class RandomDataset(Dataset): + + def __init__( + self, + m_den, + ln_emb, + data_size, + num_batches, + mini_batch_size, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + num_targets=1, + round_targets=False, + data_generation="random", + trace_file="", + enable_padding=False, + reset_seed_on_access=False, + rand_data_dist="uniform", + rand_data_min=1, + rand_data_max=1, + rand_data_mu=-1, + rand_data_sigma=1, + rand_seed=0, + cache_size=None, + ): + # compute batch size + nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size)) + if num_batches != 0: + nbatches = num_batches + data_size = nbatches * mini_batch_size + # print("Total number of batches %d" % nbatches) + + # save args (recompute data_size if needed) + self.m_den = m_den + self.ln_emb = ln_emb + self.data_size = data_size + self.num_batches = nbatches + self.mini_batch_size = mini_batch_size + self.num_indices_per_lookup = num_indices_per_lookup + self.num_indices_per_lookup_fixed = num_indices_per_lookup_fixed + self.num_targets = num_targets + self.round_targets = round_targets + self.data_generation = data_generation + self.trace_file = trace_file + self.enable_padding = enable_padding + self.reset_seed_on_access = reset_seed_on_access + self.rand_seed = rand_seed + self.rand_data_dist = rand_data_dist + self.rand_data_min = rand_data_min + self.rand_data_max = rand_data_max + self.rand_data_mu = rand_data_mu + self.rand_data_sigma = rand_data_sigma + self.cache_size = cache_size + + def reset_numpy_seed(self, numpy_rand_seed): + np.random.seed(numpy_rand_seed) + # torch.manual_seed(numpy_rand_seed) + + def __getitem__(self, index): + + if isinstance(index, slice): + return [ + self[idx] for idx in range( + index.start or 0, index.stop or len(self), index.step or 1 + ) + ] + + # WARNING: reset seed on access to first element + # (e.g. 
if same random samples needed across epochs) + if self.reset_seed_on_access and index == 0: + self.reset_numpy_seed(self.rand_seed) + + # number of data points in a batch + n = min(self.mini_batch_size, self.data_size - (index * self.mini_batch_size)) + + # generate a batch of dense and sparse features + if self.data_generation == "random": + if self.cache_size is None: + Gen = generate_dist_input_batch.__wrapped__ + cache_key = None + else: + Gen = generate_dist_input_batch + cache_key = index % self.cache_size + (X, lS_o, lS_i) = Gen( + self.m_den, + tuple(self.ln_emb.tolist()), + n, + self.num_indices_per_lookup, + self.num_indices_per_lookup_fixed, + rand_data_dist=self.rand_data_dist, + rand_data_min=self.rand_data_min, + rand_data_max=self.rand_data_max, + rand_data_mu=self.rand_data_mu, + rand_data_sigma=self.rand_data_sigma, + cache_key=cache_key, + ) + elif self.data_generation == "synthetic": + (X, lS_o, lS_i) = generate_synthetic_input_batch( + self.m_den, + self.ln_emb, + n, + self.num_indices_per_lookup, + self.num_indices_per_lookup_fixed, + self.trace_file, + self.enable_padding + ) + else: + sys.exit( + "ERROR: --data-generation=" + self.data_generation + " is not supported" + ) + + # generate a batch of target (probability of a click) + if 'cache_key' in locals() and cache_key is not None: + T = generate_random_output_batch(n, self.num_targets, self.round_targets, cache_key) + else: + T = generate_random_output_batch.__wrapped__(n, self.num_targets, self.round_targets) + + return (X, lS_o, lS_i, T) + + def __len__(self): + # WARNING: note that we produce bacthes of outputs in __getitem__ + # therefore we should use num_batches rather than data_size below + return self.num_batches + + +def collate_wrapper_random_offset(list_of_tuples): + # where each tuple is (X, lS_o, lS_i, T) + (X, lS_o, lS_i, T) = list_of_tuples[0] + return (X, + torch.stack(lS_o), + lS_i, + T) + + +def collate_wrapper_random_length(list_of_tuples): + # where each tuple is (X, lS_o, lS_i, T) + (X, lS_o, lS_i, T) = list_of_tuples[0] + return (X, + offset_to_length_converter(torch.stack(lS_o), lS_i), + lS_i, + T) + + +def make_random_data_and_loader(args, ln_emb, m_den, + offset_to_length_converter=False, cache_size=None, +): + + train_data = RandomDataset( + m_den, + ln_emb, + args.data_size, + args.num_batches, + args.mini_batch_size, + args.num_indices_per_lookup, + args.num_indices_per_lookup_fixed, + 1, # num_targets + args.round_targets, + args.data_generation, + args.data_trace_file, + args.data_trace_enable_padding, + reset_seed_on_access=True, + rand_data_dist=args.rand_data_dist, + rand_data_min=args.rand_data_min, + rand_data_max=args.rand_data_max, + rand_data_mu=args.rand_data_mu, + rand_data_sigma=args.rand_data_sigma, + rand_seed=args.numpy_rand_seed, + cache_size=cache_size, + ) # WARNING: generates a batch of lookups at once + + test_data = RandomDataset( + m_den, + ln_emb, + args.data_size, + args.num_batches, + args.mini_batch_size, + args.num_indices_per_lookup, + args.num_indices_per_lookup_fixed, + 1, # num_targets + args.round_targets, + args.data_generation, + args.data_trace_file, + args.data_trace_enable_padding, + reset_seed_on_access=True, + rand_data_dist=args.rand_data_dist, + rand_data_min=args.rand_data_min, + rand_data_max=args.rand_data_max, + rand_data_mu=args.rand_data_mu, + rand_data_sigma=args.rand_data_sigma, + rand_seed=args.numpy_rand_seed, + cache_size=cache_size, + ) + + collate_wrapper_random = collate_wrapper_random_offset + if offset_to_length_converter: + 
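+        # the length-based wrapper turns each table's cumulative offsets into
+        # per-lookup lengths (differences of consecutive offsets) via
+        # offset_to_length_converter before returning the batch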
collate_wrapper_random = collate_wrapper_random_length + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=1, + shuffle=False, + num_workers=args.num_workers, + collate_fn=collate_wrapper_random, + pin_memory=False, + drop_last=False, # True + ) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=1, + shuffle=False, + num_workers=args.num_workers, + collate_fn=collate_wrapper_random, + pin_memory=False, + drop_last=False, # True + ) + return train_data, train_loader, test_data, test_loader + + +def generate_random_data( + m_den, + ln_emb, + data_size, + num_batches, + mini_batch_size, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + num_targets=1, + round_targets=False, + data_generation="random", + trace_file="", + enable_padding=False, + length=False, # length for caffe2 version (except dlrm_s_caffe2) +): + nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size)) + if num_batches != 0: + nbatches = num_batches + data_size = nbatches * mini_batch_size + # print("Total number of batches %d" % nbatches) + + # inputs + lT = [] + lX = [] + lS_offsets = [] + lS_indices = [] + for j in range(0, nbatches): + # number of data points in a batch + n = min(mini_batch_size, data_size - (j * mini_batch_size)) + + # generate a batch of dense and sparse features + if data_generation == "random": + (Xt, lS_emb_offsets, lS_emb_indices) = generate_uniform_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + length, + ) + elif data_generation == "synthetic": + (Xt, lS_emb_offsets, lS_emb_indices) = generate_synthetic_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + trace_file, + enable_padding + ) + else: + sys.exit( + "ERROR: --data-generation=" + data_generation + " is not supported" + ) + # dense feature + lX.append(Xt) + # sparse feature (sparse indices) + lS_offsets.append(lS_emb_offsets) + lS_indices.append(lS_emb_indices) + + # generate a batch of target (probability of a click) + P = generate_random_output_batch(n, num_targets, round_targets) + lT.append(P) + + return (nbatches, lX, lS_offsets, lS_indices, lT) + + +@functools.lru_cache(maxsize=None) +def generate_random_output_batch(n, num_targets, round_targets=False, cache_key=None): + # target (probability of a click) + if round_targets: + P = np.round(ra.rand(n, num_targets).astype(np.float32)).astype(np.float32) + else: + P = ra.rand(n, num_targets).astype(np.float32) + + return torch.tensor(P) + + +# uniform ditribution (input data) +def generate_uniform_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + length, +): + # dense feature + Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32)) + + # sparse feature (sparse indices) + lS_emb_offsets = [] + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for size in ln_emb: + lS_batch_offsets = [] + lS_batch_indices = [] + offset = 0 + for _ in range(n): + # num of sparse indices to be used per embedding (between + if num_indices_per_lookup_fixed: + sparse_group_size = np.int64(num_indices_per_lookup) + else: + # random between [1,num_indices_per_lookup]) + r = ra.random(1) + sparse_group_size = np.int64( + np.round(max([1.0], r * min(size, num_indices_per_lookup))) + ) + # sparse indices to be used per embedding + r = ra.random(sparse_group_size) + sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64)) + # 
reset sparse_group_size in case some index duplicates were removed + sparse_group_size = np.int32(sparse_group.size) + # store lengths and indices + if length: # for caffe2 version + lS_batch_offsets += [sparse_group_size] + else: + lS_batch_offsets += [offset] + lS_batch_indices += sparse_group.tolist() + # update offset for next iteration + offset += sparse_group_size + lS_emb_offsets.append(torch.tensor(lS_batch_offsets)) + lS_emb_indices.append(torch.tensor(lS_batch_indices)) + + return (Xt, lS_emb_offsets, lS_emb_indices) + + +# random data from uniform or gaussian ditribution (input data) +@functools.lru_cache(maxsize=None) +def generate_dist_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + rand_data_dist, + rand_data_min, + rand_data_max, + rand_data_mu, + rand_data_sigma, + cache_key = None, +): + # dense feature + Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32)) + + # sparse feature (sparse indices) + lS_emb_offsets = [] + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for size in ln_emb: + lS_batch_offsets = [] + lS_batch_indices = [] + offset = 0 + for _ in range(n): + # num of sparse indices to be used per embedding (between + if num_indices_per_lookup_fixed: + sparse_group_size = np.int64(num_indices_per_lookup) + else: + # random between [1,num_indices_per_lookup]) + r = ra.random(1) + sparse_group_size = np.int64( + np.round(max([1.0], r * min(size, num_indices_per_lookup))) + ) + # sparse indices to be used per embedding + if rand_data_dist == "gaussian": + if rand_data_mu == -1: + rand_data_mu = (rand_data_max + rand_data_min) / 2.0 + r = ra.normal(rand_data_mu, rand_data_sigma, sparse_group_size) + sparse_group = np.clip(r, rand_data_min, rand_data_max) + sparse_group = np.unique(sparse_group).astype(np.int64) + elif rand_data_dist == "uniform": + r = ra.random(sparse_group_size) + sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64)) + else: + raise(rand_data_dist, "distribution is not supported. 
\ + please select uniform or gaussian") + + # reset sparse_group_size in case some index duplicates were removed + sparse_group_size = np.int64(sparse_group.size) + # store lengths and indices + lS_batch_offsets += [offset] + lS_batch_indices += sparse_group.tolist() + # update offset for next iteration + offset += sparse_group_size + lS_emb_offsets.append(torch.tensor(lS_batch_offsets)) + lS_emb_indices.append(torch.tensor(lS_batch_indices)) + + return (Xt, lS_emb_offsets, lS_emb_indices) + + +# synthetic distribution (input data) +def generate_synthetic_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + trace_file, + enable_padding=False, +): + # dense feature + Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32)) + + # sparse feature (sparse indices) + lS_emb_offsets = [] + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for i, size in enumerate(ln_emb): + lS_batch_offsets = [] + lS_batch_indices = [] + offset = 0 + for _ in range(n): + # num of sparse indices to be used per embedding (between + if num_indices_per_lookup_fixed: + sparse_group_size = np.int64(num_indices_per_lookup) + else: + # random between [1,num_indices_per_lookup]) + r = ra.random(1) + sparse_group_size = np.int64( + max(1, np.round(r * min(size, num_indices_per_lookup))[0]) + ) + # sparse indices to be used per embedding + file_path = trace_file + line_accesses, list_sd, cumm_sd = read_dist_from_file( + file_path.replace("j", str(i)) + ) + # debug prints + # print("input") + # print(line_accesses); print(list_sd); print(cumm_sd); + # print(sparse_group_size) + # approach 1: rand + # r = trace_generate_rand( + # line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding + # ) + # approach 2: lru + r = trace_generate_lru( + line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding + ) + # WARNING: if the distribution in the file is not consistent + # with embedding table dimensions, below mod guards against out + # of range access + sparse_group = np.unique(r).astype(np.int64) + minsg = np.min(sparse_group) + maxsg = np.max(sparse_group) + if (minsg < 0) or (size <= maxsg): + print( + "WARNING: distribution is inconsistent with embedding " + + "table size (using mod to recover and continue)" + ) + sparse_group = np.mod(sparse_group, size).astype(np.int64) + # sparse_group = np.unique(np.array(np.mod(r, size-1)).astype(np.int64)) + # reset sparse_group_size in case some index duplicates were removed + sparse_group_size = np.int64(sparse_group.size) + # store lengths and indices + lS_batch_offsets += [offset] + lS_batch_indices += sparse_group.tolist() + # update offset for next iteration + offset += sparse_group_size + lS_emb_offsets.append(torch.tensor(lS_batch_offsets)) + lS_emb_indices.append(torch.tensor(lS_batch_indices)) + + return (Xt, lS_emb_offsets, lS_emb_indices) + + +def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False): + u = ra.rand(1) + if i < max_i: + # only generate stack distances up to the number of new references seen so far + j = bisect.bisect(cumm_val, i) - 1 + fi = cumm_dist[j] + u *= fi # shrink distribution support to exclude last values + elif enable_padding: + # WARNING: disable generation of new references (once all have been seen) + fi = cumm_dist[0] + u = (1.0 - fi) * u + fi # remap distribution support to exclude first value + + for (j, f) in enumerate(cumm_dist): + if u <= f: + return cumm_val[j] + + +# 
WARNING: global define, must be consistent across all synthetic functions +cache_line_size = 1 + + +def trace_generate_lru( + line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False +): + max_sd = list_sd[-1] + l = len(line_accesses) + i = 0 + ztrace = deque() + for _ in range(out_trace_len): + sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding) + mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0 + + # generate memory reference + if sd == 0: # new reference # + line_ref = line_accesses[0] + del line_accesses[0] + line_accesses.append(line_ref) + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + i += 1 + else: # existing reference # + line_ref = line_accesses[l - sd] + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + del line_accesses[l - sd] + line_accesses.append(line_ref) + # save generated memory reference + ztrace.append(mem_ref) + + return ztrace + + +def trace_generate_rand( + line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False +): + max_sd = list_sd[-1] + l = len(line_accesses) # !!!Unique, + i = 0 + ztrace = [] + for _ in range(out_trace_len): + sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding) + mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0 + # generate memory reference + if sd == 0: # new reference # + line_ref = line_accesses.pop(0) + line_accesses.append(line_ref) + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + i += 1 + else: # existing reference # + line_ref = line_accesses[l - sd] + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + ztrace.append(mem_ref) + + return ztrace + + +def trace_profile(trace, enable_padding=False): + # number of elements in the array (assuming 1D) + # n = trace.size + + rstack = deque() # S + stack_distances = deque() # SDS + line_accesses = deque() # L + for x in trace: + r = np.uint64(x / cache_line_size) + l = len(rstack) + try: # found # + i = rstack.index(r) + # WARNING: I believe below is the correct depth in terms of meaning of the + # algorithm, but that is not what seems to be in the paper alg. + # -1 can be subtracted if we defined the distance between + # consecutive accesses (e.g. r, r) as 0 rather than 1. + sd = l - i # - 1 + # push r to the end of stack_distances + stack_distances.appendleft(sd) + # remove r from its position and insert to the top of stack + del rstack[i] # rstack.remove(r) + rstack.append(r) + except ValueError: # not found # + sd = 0 # -1 + # push r to the end of stack_distances/line_accesses + stack_distances.appendleft(sd) + line_accesses.appendleft(r) + # push r to the top of stack + rstack.append(r) + + if enable_padding: + # WARNING: notice that as the ratio between the number of samples (l) + # and cardinality (c) of a sample increases the probability of + # generating a sample gets smaller and smaller because there are + # few new samples compared to repeated samples. This means that for a + # long trace with relatively small cardinality it will take longer to + # generate all new samples and therefore obtain full distribution support + # and hence it takes longer for distribution to resemble the original. + # Therefore, we may pad the number of new samples to be on par with + # average number of samples l/c artificially. 
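+        # For example: if l = 1000 stack distances were profiled and the
+        # largest distance is c = 100 (roughly the number of distinct lines),
+        # ceil(l / c) = 10 additional zero-valued samples ("new references")
+        # are appended below.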
+        l = len(stack_distances)
+        c = max(stack_distances)
+        padding = int(np.ceil(l / c))
+        # extend() pads the deque in place (deque + list concatenation is not defined)
+        stack_distances.extend([0] * padding)
+
+    return (rstack, stack_distances, line_accesses)
+
+
+# auxiliary read/write routines
+def read_trace_from_file(file_path):
+    try:
+        with open(file_path) as f:
+            if args.trace_file_binary_type:
+                array = np.fromfile(f, dtype=np.uint64)
+                trace = array.astype(np.uint64).tolist()
+            else:
+                line = f.readline()
+                trace = list(map(lambda x: np.uint64(x), line.split(", ")))
+            return trace
+    except Exception:
+        print(f"ERROR: trace file '{file_path}' is not available.")
+
+
+def write_trace_to_file(file_path, trace):
+    try:
+        if args.trace_file_binary_type:
+            with open(file_path, "wb+") as f:
+                np.array(trace).astype(np.uint64).tofile(f)
+        else:
+            with open(file_path, "w+") as f:
+                s = str(list(trace))
+                f.write(s[1 : len(s) - 1])
+    except Exception:
+        print("ERROR: no output trace file has been provided")
+
+
+def read_dist_from_file(file_path):
+    try:
+        with open(file_path, "r") as f:
+            lines = f.read().splitlines()
+    except Exception:
+        print(f"ERROR: {file_path} wrong file or file path")
+        raise  # 'lines' would be undefined below, so re-raise instead of continuing
+    # read unique accesses
+    unique_accesses = [int(el) for el in lines[0].split(", ")]
+    # read cumulative distribution (elements are passed as two separate lists)
+    list_sd = [int(el) for el in lines[1].split(", ")]
+    cumm_sd = [float(el) for el in lines[2].split(", ")]
+
+    return unique_accesses, list_sd, cumm_sd
+
+
+def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd):
+    try:
+        with open(file_path, "w") as f:
+            # unique_accesses
+            s = str(list(unique_accesses))
+            f.write(s[1 : len(s) - 1] + "\n")
+            # list_sd
+            s = str(list_sd)
+            f.write(s[1 : len(s) - 1] + "\n")
+            # cumm_sd
+            s = str(list(cumm_sd))
+            f.write(s[1 : len(s) - 1] + "\n")
+    except Exception:
+        print("Wrong file or file path")
+
+
+if __name__ == "__main__":
+    import operator
+    import argparse
+
+    ### parse arguments ###
+    parser = argparse.ArgumentParser(description="Generate Synthetic Distributions")
+    parser.add_argument("--trace-file", type=str, default="./input/trace.log")
+    parser.add_argument("--trace-file-binary-type", type=bool, default=False)
+    parser.add_argument("--trace-enable-padding", type=bool, default=False)
+    parser.add_argument("--dist-file", type=str, default="./input/dist.log")
+    parser.add_argument(
+        "--synthetic-file", type=str, default="./input/trace_synthetic.log"
+    )
+    parser.add_argument("--numpy-rand-seed", type=int, default=123)
+    parser.add_argument("--print-precision", type=int, default=5)
+    args = parser.parse_args()
+
+    ### some basic setup ###
+    np.random.seed(args.numpy_rand_seed)
+    np.set_printoptions(precision=args.print_precision)
+
+    ### read trace ###
+    trace = read_trace_from_file(args.trace_file)
+    # print(trace)
+
+    ### profile trace ###
+    (_, stack_distances, line_accesses) = trace_profile(
+        trace, args.trace_enable_padding
+    )
+    stack_distances.reverse()
+    line_accesses.reverse()
+    # print(line_accesses)
+    # print(stack_distances)
+
+    ### compute probability distribution ###
+    # count items
+    l = len(stack_distances)
+    dc = sorted(
+        collections.Counter(stack_distances).items(), key=operator.itemgetter(0)
+    )
+
+    # create a distribution
+    list_sd = list(map(lambda tuple_x_k: tuple_x_k[0], dc))  # x = tuple_x_k[0]
+    dist_sd = list(
+        map(lambda tuple_x_k: tuple_x_k[1] / float(l), dc)
+    )  # k = tuple_x_k[1]
+    cumm_sd = deque()  # np.cumsum(dc).tolist() #prefixsum
+    for i, (_, k) in enumerate(dc):
+        if i == 0:
+            cumm_sd.append(k / float(l))
+        else:
+            # add 
the 2nd element of the i-th tuple in the dist_sd list + cumm_sd.append(cumm_sd[i - 1] + (k / float(l))) + + ### write stack_distance and line_accesses to a file ### + write_dist_to_file(args.dist_file, line_accesses, list_sd, cumm_sd) + + ### generate corresponding synthetic ### + # line_accesses, list_sd, cumm_sd = read_dist_from_file(args.dist_file) + synthetic_trace = trace_generate_lru( + line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding + ) + # synthetic_trace = trace_generate_rand( + # line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding + # ) + write_trace_to_file(args.synthetic_file, synthetic_trace) diff --git a/benchmarks/dlrm/ootb/dlrm_s_caffe2.py b/benchmarks/dlrm/ootb/dlrm_s_caffe2.py new file mode 100644 index 0000000..8e3ed74 --- /dev/null +++ b/benchmarks/dlrm/ootb/dlrm_s_caffe2.py @@ -0,0 +1,1703 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: an implementation of a deep learning recommendation model (DLRM) +# The model input consists of dense and sparse features. The former is a vector +# of floating point values. The latter is a list of sparse indices into +# embedding tables, which consist of vectors of floating point values. +# The selected vectors are passed to mlp networks denoted by triangles, +# in some cases the vectors are interacted through operators (Ops). +# +# output: +# vector of values +# model: | +# /\ +# /__\ +# | +# _____________________> Op <___________________ +# / | \ +# /\ /\ /\ +# /__\ /__\ ... /__\ +# | | | +# | Op Op +# | ____/__\_____ ____/__\____ +# | |_Emb_|____|__| ... |_Emb_|__|___| +# input: +# [ dense features ] [sparse indices] , ..., [sparse indices] +# +# More precise definition of model layers: +# 1) fully connected layers of an mlp +# z = f(y) +# y = Wx + b +# +# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) +# z = Op(e1,...,ek) +# obtain vectors e1=E[:,p1], ..., ek=E[:,pk] +# +# 3) Operator Op can be one of the following +# Sum(e1,...,ek) = e1 + ... + ek +# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] +# Cat(e1,...,ek) = [e1', ..., ek']' +# where ' denotes transpose operation +# +# References: +# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, +# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, +# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, +# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, +# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, +# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and +# Recommendation Systems", CoRR, arXiv:1906.00091, 2019 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import functools + +# others +import operator +import time +import copy + +# data generation +import dlrm_data_pytorch as dp + +# numpy +import numpy as np +import sklearn.metrics + +# onnx +# The onnx import causes deprecation warnings every time workers +# are spawned during testing. So, we filter out those warnings. 
+import warnings +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + try: + import onnx + import caffe2.python.onnx.frontend + except ImportError as error: + print('Unable to import onnx or caffe2.python.onnx.frontend ', error) + +# from caffe2.python import data_parallel_model + +# caffe2 +from caffe2.proto import caffe2_pb2 +from caffe2.python import brew, core, dyndep, model_helper, net_drawer, workspace + +""" +# auxiliary routine used to split input on the mini-bacth dimension +def where_to_split(mini_batch_size, ndevices, _add_leftover=False): + n = (mini_batch_size + ndevices - 1) // ndevices # ceiling + l = mini_batch_size - n * (ndevices - 1) # leftover + s = [n] * (ndevices - 1) + if _add_leftover: + ls += [l if l > 0 else n] + return ls +""" + + +### define dlrm in Caffe2 ### +class DLRM_Net(object): + def FeedBlobWrapper(self, tag, val, add_prefix=True, split=False, device_id=-1): + if self.ndevices > 1 and add_prefix: + if split: + # split across devices + mini_batch_size = val.shape[0] + # approach 1: np and caffe2 operators assume the mini-batch size is + # divisible exactly by the number of available devices + if mini_batch_size % self.ndevices != 0: + sys.exit("ERROR: caffe2 net assumes that the mini_batch_size " + + str(mini_batch_size) + + " is evenly divisible by the number of available devices" + + str(self.ndevices)) + vals = np.split(val, self.ndevices, axis=0) + """ + # approach 2: np and caffe2 operators do not assume exact divisibility + if args.mini_batch_size != mini_batch_size: + sys.exit("ERROR: caffe2 net was prepared for mini-batch size " + + str(args.mini_batch_size) + + " which is different from current mini-batch size " + + str(mini_batch_size) + " being passed to it. 
" + + "This is common for the last mini-batch, when " + + "mini-batch size does not evenly divided the number of " + + "elements in the data set.") + ls = where_to_split(mini_batch_size, self.ndevices) + vals = np.split(val, ls, axis=0) + """ + # feed to multiple devices + for d in range(self.ndevices): + tag_on_device = "gpu_" + str(d) + "/" + tag + _d = core.DeviceOption(workspace.GpuDeviceType, d) + workspace.FeedBlob(tag_on_device, vals[d], device_option=_d) + else: + # feed to multiple devices + for d in range(self.ndevices): + tag_on_device = "gpu_" + str(d) + "/" + tag + _d = core.DeviceOption(workspace.GpuDeviceType, d) + workspace.FeedBlob(tag_on_device, val, device_option=_d) + else: + # feed to a single device (named or not) + if device_id >= 0: + _d = core.DeviceOption(workspace.GpuDeviceType, device_id) + workspace.FeedBlob(tag, val, device_option=_d) + else: + workspace.FeedBlob(tag, val) + + def FetchBlobWrapper(self, tag, add_prefix=True, reduce_across=None, device_id=-1): + if self.ndevices > 1 and add_prefix: + # fetch from multiple devices + vals = [] + for d in range(self.ndevices): + if tag.__class__ == list: + tag_on_device = tag[d] + else: + tag_on_device = "gpu_" + str(0) + "/" + tag + val = workspace.FetchBlob(tag_on_device) + vals.append(val) + # reduce across devices + if reduce_across == "add": + return functools.reduce(operator.add, vals) + elif reduce_across == "concat": + return np.concatenate(vals) + else: + return vals + else: + # fetch from a single device (named or not) + if device_id >= 0: + tag_on_device = "gpu_" + str(device_id) + "/" + tag + return workspace.FetchBlob(tag_on_device) + else: + return workspace.FetchBlob(tag) + + def AddLayerWrapper(self, layer, inp_blobs, out_blobs, + add_prefix=True, reset_grad=False, **kwargs): + # auxiliary routine to adjust tags + def adjust_tag(blobs, on_device): + if blobs.__class__ == str: + _blobs = on_device + blobs + elif blobs.__class__ == list: + _blobs = list(map(lambda tag: on_device + tag, blobs)) + else: # blobs.__class__ == model_helper.ModelHelper or something else + _blobs = blobs + return _blobs + + if self.ndevices > 1 and add_prefix: + # add layer on multiple devices + ll = [] + for d in range(self.ndevices): + # add prefix on_device + on_device = "gpu_" + str(d) + "/" + _inp_blobs = adjust_tag(inp_blobs, on_device) + _out_blobs = adjust_tag(out_blobs, on_device) + # WARNING: reset_grad option was exlusively designed for WeightedSum + # with inp_blobs=[w, tag_one, "", lr], where "" will be replaced + if reset_grad: + w_grad = self.gradientMap[_inp_blobs[0]] + _inp_blobs[2] = w_grad + # add layer to the model + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)): + if kwargs: + new_layer = layer(_inp_blobs, _out_blobs, **kwargs) + else: + new_layer = layer(_inp_blobs, _out_blobs) + ll.append(new_layer) + return ll + else: + # add layer on a single device + # WARNING: reset_grad option was exlusively designed for WeightedSum + # with inp_blobs=[w, tag_one, "", lr], where "" will be replaced + if reset_grad: + w_grad = self.gradientMap[inp_blobs[0]] + inp_blobs[2] = w_grad + # add layer to the model + if kwargs: + new_layer = layer(inp_blobs, out_blobs, **kwargs) + else: + new_layer = layer(inp_blobs, out_blobs) + return new_layer + + def create_mlp(self, ln, sigmoid_layer, model, tag): + (tag_layer, tag_in, tag_out) = tag + + # build MLP layer by layer + layers = [] + weights = [] + for i in range(1, ln.size): + n = ln[i - 1] + m = ln[i] + + # create tags + tag_fc_w = tag_layer + 
":::" + "fc" + str(i) + "_w" + tag_fc_b = tag_layer + ":::" + "fc" + str(i) + "_b" + tag_fc_y = tag_layer + ":::" + "fc" + str(i) + "_y" + tag_fc_z = tag_layer + ":::" + "fc" + str(i) + "_z" + if i == ln.size - 1: + tag_fc_z = tag_out + weights.append(tag_fc_w) + weights.append(tag_fc_b) + + # initialize the weights + # approach 1: custom Xavier input, output or two-sided fill + mean = 0.0 # std_dev = np.sqrt(variance) + std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n) + W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32) + std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1)) + b = np.random.normal(mean, std_dev, size=m).astype(np.float32) + self.FeedBlobWrapper(tag_fc_w, W) + self.FeedBlobWrapper(tag_fc_b, b) + # approach 2: caffe2 xavier + # W = self.AddLayerWrapper( + # model.param_init_net.XavierFill, + # [], + # tag_fc_w, + # shape=[m, n] + # ) + # b = self.AddLayerWrapper( + # model.param_init_net.ConstantFill, + # [], + # tag_fc_b, + # shape=[m] + # ) + + # initialize the MLP's momentum for the Adagrad optimizer + if self.emb_optimizer in ["adagrad", "rwsadagrad"]: + # momentum of the weights + self.FeedBlobWrapper( + "momentum_mlp_{}_{}".format(tag_layer, 2 * i - 1), + np.full((m, n), 0, dtype=np.float32) + ) + # momentum of the biases + self.FeedBlobWrapper( + "momentum_mlp_{}_{}".format(tag_layer, 2 * i), + np.full((m), 0, dtype=np.float32) + ) + + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + self.onnx_tsd[tag_fc_w] = (onnx.TensorProto.FLOAT, W.shape) + self.onnx_tsd[tag_fc_b] = (onnx.TensorProto.FLOAT, b.shape) + + # approach 1: construct fully connected operator using model.net + fc = self.AddLayerWrapper( + model.net.FC, [tag_in, tag_fc_w, tag_fc_b], tag_fc_y + ) + # approach 2: construct fully connected operator using brew + # https://github.com/caffe2/tutorials/blob/master/MNIST.ipynb + # fc = brew.fc(model, layer, tag_fc_w, dim_in=m, dim_out=n) + layers.append(fc) + + if i == sigmoid_layer: + # approach 1: construct sigmoid operator using model.net + layer = self.AddLayerWrapper(model.net.Sigmoid, tag_fc_y, tag_fc_z) + # approach 2: using brew (which currently does not support sigmoid) + # tag_sigm = tag_layer + ":::" + "sigmoid" + str(i) + # layer = brew.sigmoid(model,fc,tag_sigmoid) + else: + # approach 1: construct relu operator using model.net + layer = self.AddLayerWrapper(model.net.Relu, tag_fc_y, tag_fc_z) + # approach 2: using brew + # tag_relu = tag_layer + ":::" + "relu" + str(i) + # layer = brew.relu(model,fc,tag_relu) + tag_in = tag_fc_z + layers.append(layer) + + # WARNING: the dependency between layers is implicit in the tags, + # so only the last layer is added to the layers list. It will + # later be used for interactions. 
+ return layers, weights + + def create_emb(self, m, ln, model, tag): + (tag_layer, tag_in, tag_out) = tag + emb_l = [] + weights_l = [] + vw_l = [] + for i in range(0, ln.size): + n = ln[i] + + # select device + if self.ndevices > 1: + d = i % self.ndevices + else: + d = -1 + + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + len_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_l" + ind_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_i" + tbl_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_w" + sum_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_z" + weights_l.append(tbl_s) + + # initialize the weights + # approach 1a: custom + W = np.random.uniform(low=-np.sqrt(1 / n), + high=np.sqrt(1 / n), + size=(n, m)).astype(np.float32) + # approach 1b: numpy rand + # W = ra.rand(n, m).astype(np.float32) + self.FeedBlobWrapper(tbl_s, W, False, device_id=d) + # approach 2: caffe2 xavier + # with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)): + # W = model.param_init_net.XavierFill([], tbl_s, shape=[n, m]) + # save the blob shapes for latter (only needed if onnx is requested) + + # initialize the embedding's momentum for the Adagrad optimizer + if self.emb_optimizer == "adagrad": + self.FeedBlobWrapper("momentum_emb_{}".format(i), + np.full((n, m), 0), add_prefix=False, device_id=d) + elif self.emb_optimizer == "rwsadagrad": + self.FeedBlobWrapper("momentum_emb_{}".format(i), + np.full((n), 0), add_prefix=False, device_id=d) + + if self.save_onnx: + self.onnx_tsd[tbl_s] = (onnx.TensorProto.FLOAT, W.shape) + + # create operator + if self.weighted_pooling is not None: + vw_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_v" + psw_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_s" + VW = np.ones(n).astype(np.float32) + self.FeedBlobWrapper(vw_s, VW, False, device_id=d) + if self.weighted_pooling == "learned": + vw_l.append(vw_s) + grad_on_weights = True + else: + grad_on_weights = False + if self.save_onnx: + self.onnx_tsd[vw_s] = (onnx.TensorProto.FLOAT, VW.shape) + if self.ndevices <= 1: + PSW = model.net.Gather([vw_s, ind_s], [psw_s]) + EE = model.net.SparseLengthsWeightedSum( + [tbl_s, PSW, ind_s, len_s], [sum_s], + grad_on_weights=grad_on_weights + ) + else: + with core.DeviceScope( + core.DeviceOption(workspace.GpuDeviceType, d) + ): + PSW = model.net.Gather([vw_s, ind_s], [psw_s]) + EE = model.net.SparseLengthsWeightedSum( + [tbl_s, PSW, ind_s, len_s], [sum_s], + grad_on_weights=grad_on_weights + ) + else: + if self.ndevices <= 1: + EE = model.net.SparseLengthsSum( + [tbl_s, ind_s, len_s], [sum_s] + ) + else: + with core.DeviceScope( + core.DeviceOption(workspace.GpuDeviceType, d) + ): + EE = model.net.SparseLengthsSum( + [tbl_s, ind_s, len_s], [sum_s] + ) + emb_l.append(EE) + + return emb_l, weights_l, vw_l + + def create_interactions(self, x, ly, model, tag): + (tag_dense_in, tag_sparse_in, tag_int_out) = tag + + if self.arch_interaction_op == "dot": + # concatenate dense and sparse features + tag_int_out_info = tag_int_out + "_info" + T, T_info = model.net.Concat( + x + ly, + [tag_int_out + "_cat_axis0", tag_int_out_info + "_cat_axis0"], + axis=1, + add_axis=1, + ) + # perform a dot product + Z = model.net.BatchMatMul([T, T], tag_int_out + "_matmul", trans_b=1) + # append dense feature with the interactions (into a row vector) + # approach 1: all + # Zflat = model.net.Flatten(Z, tag_int_out + "_flatten", axis=1) + # approach 2: unique + Zflat_all = model.net.Flatten(Z, tag_int_out + "_flatten_all", 
axis=1) + Zflat = model.net.BatchGather( + [Zflat_all, tag_int_out + "_tril_indices"], + tag_int_out + "_flatten" + ) + R, R_info = model.net.Concat( + x + [Zflat], [tag_int_out, tag_int_out_info], axis=1 + ) + elif self.arch_interaction_op == "cat": + # concatenation features (into a row vector) + tag_int_out_info = tag_int_out + "_info" + R, R_info = model.net.Concat( + x + ly, [tag_int_out, tag_int_out_info], axis=1 + ) + else: + sys.exit("ERROR: --arch-interaction-op=" + + self.arch_interaction_op + " is not supported") + + return R + + def create_sequential_forward_ops(self): + # embeddings + tag = (self.temb, self.tsin, self.tsout) + self.emb_l, self.emb_w, self.emb_vw = self.create_emb( + self.m_spa, self.ln_emb, self.model, tag + ) + # bottom mlp + tag = (self.tbot, self.tdin, self.tdout) + self.bot_l, self.bot_w = self.create_mlp(self.ln_bot, self.sigmoid_bot, + self.model, tag) + # interactions + tag = (self.tdout, self.tsout, self.tint) + Z = self.create_interactions([self.bot_l[-1]], self.emb_l, self.model, tag) + + # top mlp + tag = (self.ttop, Z, self.tout) + self.top_l, self.top_w = self.create_mlp(self.ln_top, self.sigmoid_top, + self.model, tag) + # debug prints + # print(self.emb_l) + # print(self.bot_l) + # print(self.top_l) + + # setup the last output variable + self.last_output = self.top_l[-1] + + def create_parallel_forward_ops(self): + # distribute embeddings (model parallelism) + tag = (self.temb, self.tsin, self.tsout) + self.emb_l, self.emb_w, self.emb_vw = self.create_emb( + self.m_spa, self.ln_emb, self.model, tag + ) + # replicate mlp (data parallelism) + tag = (self.tbot, self.tdin, self.tdout) + self.bot_l, self.bot_w = self.create_mlp(self.ln_bot, self.sigmoid_bot, + self.model, tag) + + # add communication (butterfly shuffle) + t_list = [] + for i, emb_output in enumerate(self.emb_l): + # split input + src_d = i % self.ndevices + lo = [emb_output + "_split_" + str(d) for d in range(self.ndevices)] + # approach 1: np and caffe2 operators assume the mini-batch size is + # divisible exactly by the number of available devices + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, src_d)): + self.model.net.Split(emb_output, lo, axis=0) + """ + # approach 2: np and caffe2 operators do not assume exact divisibility + ls = where_to_split(args.mini_batch_size, self.ndevices, _add_leftover=True) + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, src_d)): + emb_output_split = self.model.net.Split( + emb_output, lo, split=lp, axis=0 + ) + """ + # scatter + y = [] + for dst_d in range(len(lo)): + src_blob = lo[dst_d] + dst_blob = str(src_blob).replace( + "gpu_" + str(src_d), "gpu_" + str(dst_d), 1 + ) + if src_blob != dst_blob: + with core.DeviceScope( + core.DeviceOption(workspace.GpuDeviceType, dst_d) + ): + blob = self.model.Copy(src_blob, dst_blob) + else: + blob = dst_blob + y.append(blob) + t_list.append(y) + # adjust lists to be ordered per device + x = list(map(lambda x: list(x), zip(*self.bot_l))) + ly = list(map(lambda y: list(y), zip(*t_list))) + + # interactions + for d in range(self.ndevices): + on_device = "gpu_" + str(d) + "/" + tag = (on_device + self.tdout, on_device + self.tsout, on_device + self.tint) + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)): + self.create_interactions([x[d][-1]], ly[d], self.model, tag) + + # replicate mlp (data parallelism) + tag = (self.ttop, self.tint, self.tout) + self.top_l, self.top_w = self.create_mlp(self.ln_top, self.sigmoid_top, + self.model, tag) + + # debug prints 
+ # print(self.model.net.Proto(),end='\n') + # sys.exit("ERROR: debugging") + + # setup the last output variable + self.last_output = self.top_l[-1] + + def __init__( + self, + m_spa, + ln_emb, + ln_bot, + ln_top, + arch_interaction_op, + arch_interaction_itself=False, + sigmoid_bot=-1, + sigmoid_top=-1, + save_onnx=False, + model=None, + test_net=None, + tag=None, + ndevices=-1, + forward_ops=True, + enable_prof=False, + weighted_pooling=None, + emb_optimizer="sgd" + ): + super(DLRM_Net, self).__init__() + + # init model + if model is None: + global_init_opt = ["caffe2", "--caffe2_log_level=0"] + if enable_prof: + global_init_opt += [ + "--logtostderr=0", + "--log_dir=$HOME", + "--caffe2_logging_print_net_summary=1", + ] + workspace.GlobalInit(global_init_opt) + self.set_tags() + self.model = model_helper.ModelHelper(name="DLRM", init_params=True) + self.test_net = None + else: + # WARNING: assume that workspace and tags have been initialized elsewhere + self.set_tags(tag[0], tag[1], tag[2], tag[3], tag[4], tag[5], tag[6], + tag[7], tag[8], tag[9]) + self.model = model + self.test_net = test_net + + # save arguments + self.m_spa = m_spa + self.ln_emb = ln_emb + self.ln_bot = ln_bot + self.ln_top = ln_top + self.arch_interaction_op = arch_interaction_op + self.arch_interaction_itself = arch_interaction_itself + self.sigmoid_bot = sigmoid_bot + self.sigmoid_top = sigmoid_top + self.save_onnx = save_onnx + self.ndevices = ndevices + self.emb_optimizer = emb_optimizer + if weighted_pooling is not None and weighted_pooling != "fixed": + self.weighted_pooling = "learned" + else: + self.weighted_pooling = weighted_pooling + # onnx types and shapes dictionary + if self.save_onnx: + self.onnx_tsd = {} + # create forward operators + if forward_ops: + if self.ndevices <= 1: + return self.create_sequential_forward_ops() + else: + return self.create_parallel_forward_ops() + + def set_tags( + self, + _tag_layer_top_mlp="top", + _tag_layer_bot_mlp="bot", + _tag_layer_embedding="emb", + _tag_feature_dense_in="dense_in", + _tag_feature_dense_out="dense_out", + _tag_feature_sparse_in="sparse_in", + _tag_feature_sparse_out="sparse_out", + _tag_interaction="interaction", + _tag_dense_output="prob_click", + _tag_dense_target="target", + ): + # layer tags + self.ttop = _tag_layer_top_mlp + self.tbot = _tag_layer_bot_mlp + self.temb = _tag_layer_embedding + # dense feature tags + self.tdin = _tag_feature_dense_in + self.tdout = _tag_feature_dense_out + # sparse feature tags + self.tsin = _tag_feature_sparse_in + self.tsout = _tag_feature_sparse_out + # output and target tags + self.tint = _tag_interaction + self.ttar = _tag_dense_target + self.tout = _tag_dense_output + + def parameters(self): + return self.model + + def get_loss(self): + return self.FetchBlobWrapper(self.loss, reduce_across="add") + + def get_output(self): + return self.FetchBlobWrapper(self.last_output, reduce_across="concat") + + def create(self, X, S_lengths, S_indices, T): + self.create_input(X, S_lengths, S_indices, T) + self.create_model(X, S_lengths, S_indices, T) + + def create_input(self, X, S_lengths, S_indices, T): + # feed input data to blobs + self.FeedBlobWrapper(self.tdin, X, split=True) + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + self.onnx_tsd[self.tdin] = (onnx.TensorProto.FLOAT, X.shape) + + for i in range(len(self.emb_l)): + # select device + if self.ndevices > 1: + d = i % self.ndevices + else: + d = -1 + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + 
str(d) + "/" + len_s = on_device + self.temb + ":::" + "sls" + str(i) + "_l" + ind_s = on_device + self.temb + ":::" + "sls" + str(i) + "_i" + self.FeedBlobWrapper(len_s, np.array(S_lengths[i]), False, device_id=d) + self.FeedBlobWrapper(ind_s, np.array(S_indices[i]), False, device_id=d) + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + lshape = (len(S_lengths[i]),) # =args.mini_batch_size + ishape = (len(S_indices[i]),) + self.onnx_tsd[len_s] = (onnx.TensorProto.INT32, lshape) + self.onnx_tsd[ind_s] = (onnx.TensorProto.INT32, ishape) + + # feed target data to blobs + if T is not None: + zeros_fp32 = np.zeros(T.shape).astype(np.float32) + self.FeedBlobWrapper(self.ttar, zeros_fp32, split=True) + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + self.onnx_tsd[self.ttar] = (onnx.TensorProto.FLOAT, T.shape) + + def create_model(self, X, S_lengths, S_indices, T): + #setup tril indices for the interactions + offset = 1 if self.arch_interaction_itself else 0 + num_fea = len(self.emb_l) + 1 + tril_indices = np.array([j + i * num_fea + for i in range(num_fea) for j in range(i + offset)]) + self.FeedBlobWrapper(self.tint + "_tril_indices", tril_indices) + + # create compute graph + if T is not None: + # WARNING: RunNetOnce call is needed only if we use brew and ConstantFill. + # We could use direct calls to self.model functions above to avoid it + workspace.RunNetOnce(self.model.param_init_net) + workspace.CreateNet(self.model.net) + if self.test_net is not None: + workspace.CreateNet(self.test_net) + + def run(self, X, S_lengths, S_indices, T, test_net=False, enable_prof=False): + # feed input data to blobs + # dense features + self.FeedBlobWrapper(self.tdin, X, split=True) + # sparse features + for i in range(len(self.emb_l)): + # select device + if self.ndevices > 1: + d = i % self.ndevices + else: + d = -1 + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + len_s = on_device + self.temb + ":::" + "sls" + str(i) + "_l" + ind_s = on_device + self.temb + ":::" + "sls" + str(i) + "_i" + self.FeedBlobWrapper(len_s, np.array(S_lengths[i]), False, device_id=d) + self.FeedBlobWrapper(ind_s, np.array(S_indices[i]), False, device_id=d) + + # feed target data to blobs if needed + if T is not None: + self.FeedBlobWrapper(self.ttar, T, split=True) + # execute compute graph + if test_net: + workspace.RunNet(self.test_net) + else: + if enable_prof: + workspace.C.benchmark_net(self.model.net.Name(), 0, 1, True) + else: + workspace.RunNet(self.model.net) + # debug prints + # print("intermediate") + # print(self.FetchBlobWrapper(self.bot_l[-1])) + # for tag_emb in self.emb_l: + # print(self.FetchBlobWrapper(tag_emb)) + # print(self.FetchBlobWrapper(self.tint)) + + def MSEloss(self, scale=1.0): + # add MSEloss to the model + self.AddLayerWrapper(self.model.SquaredL2Distance, [self.tout, self.ttar], "sd") + self.AddLayerWrapper(self.model.Scale, "sd", "sd2", scale=2.0 * scale) + # WARNING: "loss" is a special tag and should not be changed + self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd2", "loss") + + def BCEloss(self, scale=1.0, threshold=0.0): + # add BCEloss to the mode + if 0.0 < threshold and threshold < 1.0: + self.AddLayerWrapper(self.model.Clip, self.tout, "tout_c", + min=threshold, max=(1.0 - threshold)) + self.AddLayerWrapper(self.model.MakeTwoClass, "tout_c", "tout_2c") + else: + self.AddLayerWrapper(self.model.MakeTwoClass, self.tout, "tout_2c") + 
self.AddLayerWrapper(self.model.LabelCrossEntropy, ["tout_2c", self.ttar], "sd") + # WARNING: "loss" is a special tag and should not be changed + if scale == 1.0: + self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd", "loss") + else: + self.AddLayerWrapper(self.model.Scale, "sd", "sd2", scale=scale) + self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd2", "loss") + + def sgd_optimizer(self, learning_rate, + T=None, _gradientMap=None, sync_dense_params=True): + # create one, it and lr tags (or use them if already present) + if T is not None: + (tag_one, tag_it, tag_lr) = T + else: + (tag_one, tag_it, tag_lr) = ("const_one", "optim_it", "optim_lr") + + # approach 1: feed values directly + # self.FeedBlobWrapper(tag_one, np.ones(1).astype(np.float32)) + # self.FeedBlobWrapper(tag_it, np.zeros(1).astype(np.int64)) + # it = self.AddLayerWrapper(self.model.Iter, tag_it, tag_it) + # lr = self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr, + # base_lr=-1 * learning_rate, policy="fixed") + # approach 2: use brew + self.AddLayerWrapper(self.model.param_init_net.ConstantFill, + [], tag_one, shape=[1], value=1.0) + self.AddLayerWrapper(brew.iter, self.model, tag_it) + self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr, + base_lr=-1 * learning_rate, policy="fixed") + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + self.onnx_tsd[tag_one] = (onnx.TensorProto.FLOAT, (1,)) + self.onnx_tsd[tag_it] = (onnx.TensorProto.INT64, (1,)) + + # create gradient maps (or use them if already present) + if _gradientMap is not None: + self.gradientMap = _gradientMap + else: + if self.loss.__class__ == list: + self.gradientMap = self.model.AddGradientOperators(self.loss) + else: + self.gradientMap = self.model.AddGradientOperators([self.loss]) + + # update weights + # approach 1: builtin function + # optimizer.build_sgd(self.model, base_learning_rate=learning_rate) + # approach 2: custom code + # top MLP weight and bias + for w in self.top_w: + # allreduce across devices if needed + if sync_dense_params and self.ndevices > 1: + grad_blobs = [ + self.gradientMap["gpu_{}/".format(d) + w] + for d in range(self.ndevices) + ] + self.model.NCCLAllreduce(grad_blobs, grad_blobs) + # update weights + self.AddLayerWrapper(self.model.WeightedSum, + [w, tag_one, "", tag_lr], w, reset_grad=True) + # bottom MLP weight and bias + for w in self.bot_w: + # allreduce across devices if needed + if sync_dense_params and self.ndevices > 1: + grad_blobs = [ + self.gradientMap["gpu_{}/".format(d) + w] + for d in range(self.ndevices) + ] + self.model.NCCLAllreduce(grad_blobs, grad_blobs) + # update weights + self.AddLayerWrapper(self.model.WeightedSum, + [w, tag_one, "", tag_lr], w, reset_grad=True) + # update embeddings + for i, w in enumerate(self.emb_w): + # select device + if self.ndevices > 1: + d = i % self.ndevices + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + _tag_one = on_device + tag_one + _tag_lr = on_device + tag_lr + # pickup gradient + w_grad = self.gradientMap[w] + # update weights + if self.ndevices > 1: + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)): + self.model.ScatterWeightedSum([w, _tag_one, w_grad.indices, + w_grad.values, _tag_lr], w) + else: + self.model.ScatterWeightedSum([w, _tag_one, w_grad.indices, + w_grad.values, _tag_lr], w) + + # update per sample weights + if self.weighted_pooling == "learned": + for i, w in enumerate(self.emb_vw): + # select device + if self.ndevices 
> 1: + d = i % self.ndevices + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + _tag_one = on_device + tag_one + _tag_lr = on_device + tag_lr + # pickup gradient + w_grad = self.gradientMap[w] + # update weights + if self.ndevices > 1: + with core.DeviceScope( + core.DeviceOption(workspace.GpuDeviceType, d) + ): + self.model.ScatterWeightedSum( + [w, _tag_one, w_grad.indices, + w_grad.values, _tag_lr], w + ) + else: + self.model.ScatterWeightedSum( + [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w + ) + + def adagrad_optimizer(self, learning_rate, + T=None, _gradientMap=None, sync_dense_params=True, + epsilon=1e-10, decay_=0.0, weight_decay_=0.0): + # create one, it and lr tags (or use them if already present) + if T is not None: + (tag_one, tag_it, tag_lr) = T + else: + (tag_one, tag_it, tag_lr) = ("const_one", "optim_it", "optim_lr") + + # approach 1: feed values directly + # self.FeedBlobWrapper(tag_one, np.ones(1).astype(np.float32)) + # self.FeedBlobWrapper(tag_it, np.zeros(1).astype(np.int64)) + # it = self.AddLayerWrapper(self.model.Iter, tag_it, tag_it) + # lr = self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr, + # base_lr=-1 * learning_rate, policy="fixed") + # approach 2: use brew + self.AddLayerWrapper(self.model.param_init_net.ConstantFill, + [], tag_one, shape=[1], value=1.0) + self.AddLayerWrapper(brew.iter, self.model, tag_it) + self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr, + base_lr=-1 * learning_rate, policy="fixed") + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + self.onnx_tsd[tag_one] = (onnx.TensorProto.FLOAT, (1,)) + self.onnx_tsd[tag_it] = (onnx.TensorProto.INT64, (1,)) + + # create gradient maps (or use them if already present) + if _gradientMap is not None: + self.gradientMap = _gradientMap + else: + if self.loss.__class__ == list: + self.gradientMap = self.model.AddGradientOperators(self.loss) + else: + self.gradientMap = self.model.AddGradientOperators([self.loss]) + + # update weights + # approach 1: builtin function + # optimizer.build_sgd(self.model, base_learning_rate=learning_rate) + # approach 2: custom code + # top MLP weight and bias + for i, w in enumerate(self.top_w): + # allreduce across devices if needed + if sync_dense_params and self.ndevices > 1: + grad_blobs = [ + self.gradientMap["gpu_{}/".format(d) + w] + for d in range(self.ndevices) + ] + self.model.NCCLAllreduce(grad_blobs, grad_blobs) + # update weights + self.model.Adagrad( + [ + w, + "momentum_mlp_top_{}".format(i + 1), + self.gradientMap[w], + tag_lr + ], + [w, "momentum_mlp_top_{}".format(i + 1)], + epsilon=epsilon, + decay_=decay_, + weight_decay_=weight_decay_ + ) + + # bottom MLP weight and bias + for i, w in enumerate(self.bot_w): + # allreduce across devices if needed + if sync_dense_params and self.ndevices > 1: + grad_blobs = [ + self.gradientMap["gpu_{}/".format(d) + w] + for d in range(self.ndevices) + ] + self.model.NCCLAllreduce(grad_blobs, grad_blobs) + # update weights + self.model.Adagrad( + [ + w, + "momentum_mlp_bot_{}".format(i + 1), + self.gradientMap[w], + tag_lr + ], + [w, "momentum_mlp_bot_{}".format(i + 1)], + epsilon=epsilon, + decay_=decay_, + weight_decay_=weight_decay_ + ) + + # update embeddings + for i, w in enumerate(self.emb_w): + # select device + if self.ndevices > 1: + d = i % self.ndevices + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + _tag_one = on_device + tag_one + _tag_lr = on_device + tag_lr + # 
pickup gradient + w_grad = self.gradientMap[w] + # update weights + def add_optimizer(): + self.model.Unique( + w_grad.indices, + ["unique_w_grad_indices", "remapping_w_grad_indices"] + ) + self.model.UnsortedSegmentSum( + [w_grad.values, "remapping_w_grad_indices"], + "unique_w_grad_values" + ) + + if self.emb_optimizer == "adagrad": + self.model.SparseAdagrad( + [ + w, + "momentum_emb_{}".format(i), + "unique_w_grad_indices", + "unique_w_grad_values", + _tag_lr + ], + [w, "momentum_emb_{}".format(i)], + epsilon=epsilon, + decay_=decay_, + weight_decay_=weight_decay_ + ) + + elif self.emb_optimizer == "rwsadagrad": + self.model.RowWiseSparseAdagrad( + [ + w, + "momentum_emb_{}".format(i), + "unique_w_grad_indices", + "unique_w_grad_values", + _tag_lr + ], + [w, "momentum_emb_{}".format(i)], + epsilon=epsilon, + decay_=decay_, + weight_decay_=weight_decay_ + ) + + if self.ndevices > 1: + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)): + add_optimizer() + else: + add_optimizer() + + # update per sample weights + if self.weighted_pooling == "learned": + for i, w in enumerate(self.emb_vw): + # select device + if self.ndevices > 1: + d = i % self.ndevices + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + _tag_one = on_device + tag_one + _tag_lr = on_device + tag_lr + # pickup gradient + w_grad = self.gradientMap[w] + # update weights + if self.ndevices > 1: + with core.DeviceScope( + core.DeviceOption(workspace.GpuDeviceType, d) + ): + self.model.ScatterWeightedSum( + [w, _tag_one, w_grad.indices, + w_grad.values, _tag_lr], w + ) + else: + self.model.ScatterWeightedSum( + [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w + ) + + def print_all(self): + # approach 1: all + print(workspace.Blobs(), end='\n') + for _, l in enumerate(workspace.Blobs()): + print(l) + print(self.FetchBlobWrapper(l)) + # approach 2: only summary + # for param in self.model.params: + # self.model.Summarize(param, [], to_file=1) + # self.model.Summarize(self.model.param_to_grad[param], [], to_file=1) + + def print_weights(self): + for _, l in enumerate(self.emb_w): + # print(l) + print(self.FetchBlobWrapper(l, False)) + if self.weighted_pooling == "learned": + for _, l in enumerate(self.emb_vw): + # print(l) + print(self.FetchBlobWrapper(l, False)) + for _, l in enumerate(self.bot_w): + # print(l) + if self.ndevices > 1: + print(self.FetchBlobWrapper(l, False, device_id=0)) + else: + print(self.FetchBlobWrapper(l)) + for _, l in enumerate(self.top_w): + # print(l) + if self.ndevices > 1: + print(self.FetchBlobWrapper(l, False, device_id=0)) + else: + print(self.FetchBlobWrapper(l)) + + def print_activations(self): + for _, l in enumerate(self.emb_l): + print(l) + print(self.FetchBlobWrapper(l, False)) + for _, l in enumerate(self.bot_l): + print(l) + print(self.FetchBlobWrapper(l)) + print(self.tint) + print(self.FetchBlobWrapper(self.tint)) + for _, l in enumerate(self.top_l): + print(l) + print(self.FetchBlobWrapper(l)) + + +def define_metrics(): + metrics = { + 'loss': lambda y_true, y_score: + sklearn.metrics.log_loss( + y_true=y_true, + y_pred=y_score, + labels=[0,1]), + 'recall': lambda y_true, y_score: + sklearn.metrics.recall_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'precision': lambda y_true, y_score: + sklearn.metrics.precision_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'f1': lambda y_true, y_score: + sklearn.metrics.f1_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'ap': 
sklearn.metrics.average_precision_score, + 'roc_auc': sklearn.metrics.roc_auc_score, + 'accuracy': lambda y_true, y_score: + sklearn.metrics.accuracy_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + # 'pre_curve' : sklearn.metrics.precision_recall_curve, + # 'roc_curve' : sklearn.metrics.roc_curve, + } + return metrics + + +def calculate_metrics(targets, scores): + scores = np.concatenate(scores, axis=0) + targets = np.concatenate(targets, axis=0) + + metrics = define_metrics() + + # print("Compute time for validation metric : ", end="") + # first_it = True + validation_results = {} + for metric_name, metric_function in metrics.items(): + # if first_it: + # first_it = False + # else: + # print(", ", end="") + # metric_compute_start = time_wrap(False) + try: + validation_results[metric_name] = metric_function( + targets, + scores + ) + except Exception as error : + validation_results[metric_name] = -1 + print("{} in calculating {}".format(error, metric_name)) + # metric_compute_end = time_wrap(False) + # met_time = metric_compute_end - metric_compute_start + # print("{} {:.4f}".format(metric_name, 1000 * (met_time)), + # end="") + # print(" ms") + return validation_results + + +if __name__ == "__main__": + ### import packages ### + import sys + import argparse + + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Train Deep Learning Recommendation Model (DLRM)" + ) + # model related parameters + parser.add_argument("--arch-sparse-feature-size", type=int, default=2) + parser.add_argument("--arch-embedding-size", type=str, default="4-3-2") + parser.add_argument("--arch-mlp-bot", type=str, default="4-3-2") + parser.add_argument("--arch-mlp-top", type=str, default="4-2-1") + parser.add_argument("--arch-interaction-op", type=str, default="dot") + parser.add_argument("--arch-interaction-itself", action="store_true", default=False) + # activations and loss + parser.add_argument("--activation-function", type=str, default="relu") + parser.add_argument("--loss-function", type=str, default="mse") # or bce + parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 + parser.add_argument("--round-targets", type=bool, default=False) + parser.add_argument("--weighted-pooling", type=str, default=None) + # data + parser.add_argument("--data-size", type=int, default=1) + parser.add_argument("--num-batches", type=int, default=0) + parser.add_argument("--data-generation", type=str, default="random") # or synthetic or dataset + parser.add_argument("--rand-data-dist", type=str, default="uniform") # uniform or gaussian + parser.add_argument("--rand-data-min", type=float, default=0) + parser.add_argument("--rand-data-max", type=float, default=1) + parser.add_argument("--rand-data-mu", type=float, default=-1) + parser.add_argument("--rand-data-sigma", type=float, default=1) + parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log") + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--data-trace-enable-padding", type=bool, default=False) + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--num-indices-per-lookup", type=int, default=10) + 
parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False) + parser.add_argument("--num-workers", type=int, default=0) + parser.add_argument("--memory-map", action="store_true", default=False) + # training + parser.add_argument("--mini-batch-size", type=int, default=1) + parser.add_argument("--nepochs", type=int, default=1) + parser.add_argument("--learning-rate", type=float, default=0.01) + parser.add_argument("--print-precision", type=int, default=5) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + parser.add_argument("--sync-dense-params", type=bool, default=True) + parser.add_argument("--caffe2-net-type", type=str, default="") + parser.add_argument("--optimizer", type=str, default="sgd", + help="""This is the optimizer for embedding tables.""") + parser.add_argument( + "--dataset-multiprocessing", + action="store_true", + default=False, + help="The Kaggle dataset can be multiprocessed in an environment \ + with more than 7 CPU cores and more than 20 GB of memory. \n \ + The Terabyte dataset can be multiprocessed in an environment \ + with more than 24 CPU cores and at least 1 TB of memory.", + ) + # inference + parser.add_argument("--inference-only", action="store_true", default=False) + # onnx (or protobuf with shapes) + parser.add_argument("--save-onnx", action="store_true", default=False) + parser.add_argument("--save-proto-types-shapes", action="store_true", default=False) + # gpu + parser.add_argument("--use-gpu", action="store_true", default=False) + # debugging and profiling + parser.add_argument("--print-freq", type=int, default=1) + parser.add_argument("--test-freq", type=int, default=-1) + parser.add_argument("--test-mini-batch-size", type=int, default=-1) + parser.add_argument("--test-num-workers", type=int, default=-1) + parser.add_argument("--print-time", action="store_true", default=False) + parser.add_argument("--debug-mode", action="store_true", default=False) + parser.add_argument("--enable-profiling", action="store_true", default=False) + parser.add_argument("--plot-compute-graph", action="store_true", default=False) + # mlperf logging (disables other output and stops early) + parser.add_argument("--mlperf-logging", action="store_true", default=False) + # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107 + parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0) + # stop at target AUC Terabyte (no subsampling) 0.8025 + parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) + args = parser.parse_args() + + if args.dataset_multiprocessing: + assert float(sys.version[:3]) > 3.7, "The dataset_multiprocessing " + \ + "flag is susceptible to a bug in Python 3.7 and under. " + \ + "https://github.com/facebookresearch/dlrm/issues/172" + + ### some basic setup ### + # WARNING: to obtain exactly the same initialization for + # the weights we need to start from the same random seed. 
+ np.random.seed(args.numpy_rand_seed) + + np.set_printoptions(precision=args.print_precision) + if (args.test_mini_batch_size < 0): + # if the parameter is not set, use the training batch size + args.test_mini_batch_size = args.mini_batch_size + if (args.test_num_workers < 0): + # if the parameter is not set, use the same parameter for training + args.test_num_workers = args.num_workers + + use_gpu = args.use_gpu + if use_gpu: + device_opt = core.DeviceOption(workspace.GpuDeviceType, 0) + ngpus = workspace.NumGpuDevices() # 1 + print("Using {} GPU(s)...".format(ngpus)) + else: + device_opt = core.DeviceOption(caffe2_pb2.CPU) + print("Using CPU...") + + ### prepare training data ### + ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") + if args.data_generation == "dataset": + if args.num_workers > 0 or args.test_num_workers > 0: + print("WARNING: non default --num-workers or --test-num-workers options" + + " are not supported and will be ignored") + if args.mini_batch_size != args.test_mini_batch_size: + print("WARNING: non default ----test-mini-batch-size option" + + " is not supported and will be ignored") + + # input and target from dataset + + train_data, train_ld, test_data, test_ld = \ + dp.make_criteo_data_and_loaders( + args, + offset_to_length_converter=True, + ) + + nbatches = args.num_batches if args.num_batches > 0 \ + else len(train_ld) + + nbatches_test = len(test_ld) + + ln_emb = train_data.counts + m_den = train_data.m_den + + # enforce maximum limit on number of vectors per embedding + if args.max_ind_range > 0: + ln_emb = np.array(list(map( + lambda x: x if x < args.max_ind_range else args.max_ind_range, + ln_emb + ))) + ln_bot[0] = m_den + + else: + if args.num_workers > 0 or args.test_num_workers > 0: + print("WARNING: non default --num-workers or --test-num-workers options" + + " are not supported and will be ignored") + if args.mini_batch_size != args.test_mini_batch_size: + print("WARNING: non default ----test-mini-batch-size option" + + " is not supported and will be ignored") + + # input and target at random + ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") + m_den = ln_bot[0] + train_data, train_ld, test_data, test_ld = dp.make_random_data_and_loader(args, ln_emb, m_den, \ + offset_to_length_converter=True, + ) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + nbatches_test = len(test_ld) + # table_feature_map = {idx : idx for idx in range(len(ln_emb))} + + ### parse command line arguments ### + m_spa = args.arch_sparse_feature_size + ln_emb = np.asarray(ln_emb) + num_fea = ln_emb.size + 1 # num sparse + num dense features + m_den_out = ln_bot[ln_bot.size - 1] + if args.arch_interaction_op == "dot": + # approach 1: all + # num_int = num_fea * num_fea + m_den_out + # approach 2: unique + if args.arch_interaction_itself: + num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out + else: + num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out + elif args.arch_interaction_op == "cat": + num_int = num_fea * m_den_out + else: + sys.exit("ERROR: --arch-interaction-op=" + + args.arch_interaction_op + " is not supported") + arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top + ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") + # sanity check: feature sizes and mlp dimensions must match + if m_den != ln_bot[0]: + sys.exit("ERROR: arch-dense-feature-size " + + str(m_den) + " does not match first dim of bottom mlp " + str(ln_bot[0])) + if m_spa != m_den_out: + sys.exit("ERROR: 
arch-sparse-feature-size " + + str(m_spa) + " does not match last dim of bottom mlp " + str(m_den_out)) + if num_int != ln_top[0]: + sys.exit("ERROR: # of feature interactions " + + str(num_int) + " does not match first dim of top mlp " + str(ln_top[0])) + + # test prints (model arch) + if args.debug_mode: + print("model arch:") + print("mlp top arch " + str(ln_top.size - 1) + + " layers, with input to output dimensions:") + print(ln_top) + + print("# of interactions") + print(num_int) + print("mlp bot arch " + str(ln_bot.size - 1) + + " layers, with input to output dimensions:") + print(ln_bot) + print("# of features (sparse and dense)") + print(num_fea) + print("dense feature size") + print(m_den) + print("sparse feature size") + print(m_spa) + print("# of embeddings (= # of sparse features) " + str(ln_emb.size) + + ", with dimensions " + str(m_spa) + "x:") + print(ln_emb) + + print("data (inputs and targets):") + for j, inputBatch in enumerate(train_ld): + lX_j, lS_l_j, lS_i_j, lT_j = inputBatch + print("mini-batch: %d" % j) + print(lX_j) + print(lS_l_j) + print(lS_i_j) + print(lT_j) + + ### construct the neural network specified above ### + # WARNING: to obtain exactly the same initialization for + # the weights we need to start from the same random seed. + # np.random.seed(args.numpy_rand_seed) + ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1 + flag_types_shapes = args.save_onnx or args.save_proto_types_shapes + flag_forward_ops = not (use_gpu and ndevices > 1) + with core.DeviceScope(device_opt): + dlrm = DLRM_Net( + m_spa, + ln_emb, + ln_bot, + ln_top, + args.arch_interaction_op, + arch_interaction_itself=args.arch_interaction_itself, + sigmoid_bot=-1, + sigmoid_top=ln_top.size - 1, + save_onnx=flag_types_shapes, + ndevices=ndevices, + # forward_ops = flag_forward_ops + enable_prof=args.enable_profiling, + weighted_pooling=args.weighted_pooling, + emb_optimizer=args.optimizer + ) + # load nccl if using multiple devices + if args.sync_dense_params and ndevices > 1: + dyndep.InitOpsLibrary("//caffe2/caffe2/contrib/nccl:nccl_ops") + # set the net type for better performance (dag, async_scheduling, etc) + if args.caffe2_net_type: + dlrm.parameters().net.Proto().type = args.caffe2_net_type + # plot compute graph + if args.plot_compute_graph: + graph = net_drawer.GetPydotGraph( + dlrm.parameters().net, + "dlrm_s_caffe2_graph", + "BT" + ) + graph.write_pdf(graph.get_name() + ".pdf") + # test prints + if args.debug_mode: + print("initial parameters (weights and bias):") + dlrm.print_weights() + + # add training loss if needed + if not args.inference_only: + with core.DeviceScope(device_opt): + # specify the loss function + nd = 1.0 if dlrm.ndevices <= 1 else 1.0 / dlrm.ndevices # 1 + if args.loss_function == "mse": + dlrm.MSEloss(scale=nd) + elif args.loss_function == "bce": + dlrm.BCEloss(scale=nd, threshold=args.loss_threshold) + else: + sys.exit("ERROR: --loss-function=" + args.loss_function + + " is not supported") + + # define test net (as train net without gradients) + dlrm.test_net = core.Net(copy.deepcopy(dlrm.model.net.Proto())) + + # specify the optimizer algorithm + if args.optimizer == "sgd": + dlrm.sgd_optimizer( + args.learning_rate, sync_dense_params=args.sync_dense_params + ) + elif args.optimizer in ["adagrad", "rwsadagrad"]: + dlrm.adagrad_optimizer( + args.learning_rate, sync_dense_params=args.sync_dense_params + ) + else: + sys.exit("""ERROR: Select an optimizer for + embedding tables : 'sgd', 'adagrad', + or 'rwsadagrad' """) + + # 
init/create + X, lS_l, lS_i, T = next(iter(train_ld)) # does not affect the enumerate(train_ld) in the main loop + dlrm.create(X, lS_l, lS_i, T.int()) + + ### main loop ### + best_gA_test = 0 + best_auc_test = 0 + total_time = 0 + total_loss = 0 + total_accu = 0 + total_iter = 0 + total_samp = 0 + k = 0 + + print("time/loss/accuracy (if enabled):") + while k < args.nepochs: + j = 0 + for j, inputBatch in enumerate(train_ld): + # forward and backward pass, where the latter runs only + # when gradients and loss have been added to the net + time1 = time.time() + lX_j, lS_l_j, lS_i_j, lT_j = inputBatch + lT_j = lT_j.int() if args.loss_function == "bce" else lT_j + dlrm.run(lX_j, lS_l_j, lS_i_j, lT_j) + + time2 = time.time() + total_time += time2 - time1 + + # compte loss and accuracy + Z = dlrm.get_output() # numpy array + T = lT_j.numpy() + ''' + # debug prints + print("output and loss") + print(Z) + print(dlrm.get_loss()) + ''' + mbs = T.shape[0] # = args.mini_batch_size except maybe for last + A = np.sum((np.round(Z, 0) == T).astype(np.uint8)) + total_accu += 0 if args.inference_only else A + total_loss += 0 if args.inference_only else dlrm.get_loss() * mbs + total_iter += 1 + total_samp += mbs + + # print time, loss and accuracy + should_print = ((j + 1) % args.print_freq == 0) or (j + 1 == nbatches) + should_test = ( + (args.test_freq > 0) + and (args.data_generation in ["dataset", "random"]) + and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches)) + ) + if should_print or should_test: + gT = 1000. * total_time / total_iter if args.print_time else -1 + total_time = 0 + + gA = total_accu / total_samp + total_accu = 0 + + gL = total_loss / total_samp + total_loss = 0 + + str_run_type = "inference" if args.inference_only else "training" + print( + "Finished {} it {}/{} of epoch {}, {:.2f} ms/it,".format( + str_run_type, j + 1, nbatches, k, gT + ) + + " loss {:.6f}".format(gL) + ) + total_iter = 0 + total_samp = 0 + # debug prints + # print(Z) + # print(T) + + # testing + if should_test and not args.inference_only: + # don't measure training iter time in a test iteration + if args.mlperf_logging: + previous_iteration_time = None + + test_accu = 0 + test_loss = 0 + test_samp = 0 + + if args.mlperf_logging: + scores = [] + targets = [] + + for i, testBatch in enumerate(test_ld): + # early exit if nbatches was set by the user and was exceeded + if nbatches > 0 and i >= nbatches: + break + + # forward pass + + lX_test_i, lS_l_test_i, lS_i_test_i, lT_test_i = testBatch + lT_test_i = lT_test_i.int() if args.loss_function == "bce" else lT_test_i + dlrm.run(lX_test_i, lS_l_test_i, lS_i_test_i, lT_test_i, test_net=True) + + Z_test = dlrm.get_output() + T_test = lT_test_i.numpy() + + if args.mlperf_logging: + scores.append(Z_test) + targets.append(T_test) + else: + # compte loss and accuracy + L_test = dlrm.get_loss() + mbs_test = T_test.shape[0] # = mini_batch_size except last + A_test = np.sum((np.round(Z_test, 0) == T_test).astype(np.uint8)) + test_accu += A_test + test_loss += L_test * mbs_test + test_samp += mbs_test + + # compute metrics (after test loop has finished) + if args.mlperf_logging: + validation_results = calculate_metrics(targets, scores) + gA_test = validation_results['accuracy'] + gL_test = validation_results['loss'] + else: + gA_test = test_accu / test_samp + gL_test = test_loss / test_samp + + # print metrics + is_best = gA_test > best_gA_test + if is_best: + best_gA_test = gA_test + + if args.mlperf_logging: + is_best = validation_results['roc_auc'] > best_auc_test + if 
is_best: + best_auc_test = validation_results['roc_auc'] + + print( + "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k) + + " loss {:.6f}, recall {:.4f}, precision {:.4f},".format( + validation_results['loss'], + validation_results['recall'], + validation_results['precision'] + ) + + " f1 {:.4f}, ap {:.4f},".format( + validation_results['f1'], + validation_results['ap'], + ) + + " auc {:.4f}, best auc {:.4f},".format( + validation_results['roc_auc'], + best_auc_test + ) + + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format( + validation_results['accuracy'] * 100, + best_gA_test * 100 + ) + ) + else: + print( + "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, 0) + + " loss {:.6f}, accuracy {:3.3f} %, best {:3.3f} %".format( + gL_test, gA_test * 100, best_gA_test * 100 + ) + ) + + # check thresholds + if (args.mlperf_logging + and (args.mlperf_acc_threshold > 0) + and (best_gA_test > args.mlperf_acc_threshold)): + print("MLPerf testing accuracy threshold " + + str(args.mlperf_acc_threshold) + + " reached, stop training") + break + + if (args.mlperf_logging + and (args.mlperf_auc_threshold > 0) + and (best_auc_test > args.mlperf_auc_threshold)): + print("MLPerf testing auc threshold " + + str(args.mlperf_auc_threshold) + + " reached, stop training") + break + + j += 1 # nbatches + k += 1 # nepochs + + # test prints + if not args.inference_only and args.debug_mode: + print("updated parameters (weights and bias):") + dlrm.print_weights() + + # build onnx model from caffe2 + if args.save_onnx: + pnet = dlrm.parameters().net.Proto() + inet = dlrm.parameters().param_init_net.Proto() + value_info = dlrm.onnx_tsd # None + # debug prints + # print(value_info) + + # WARNING: Why Caffe2 to ONNX net transformation currently does not work? + # 1. ONNX does not support SparseLengthsSum operator directly. A workaround + # could be for the Caffe2 ONNX frontend to indirectly map this operator to + # Gather and ReducedSum ONNX operators, following the PyTorch approach. 
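# [Editor's note: illustrative sketch, not part of the patch.]
# The workaround described in the warning above, in isolation: a pooled sparse
# lookup in the SparseLengthsSum style can be expressed as a Gather followed by a
# per-segment reduction, which is what an ONNX export would need to emit instead
# of the unsupported operator. Plain NumPy restatement with made-up shapes; this
# is not the actual Caffe2-to-ONNX mapping code.
import numpy as np

E = np.arange(20, dtype=np.float32).reshape(10, 2)   # embedding table: 10 rows of dim 2
indices = np.array([1, 3, 3, 7])                     # flattened sparse indices
lengths = np.array([3, 1])                           # indices per sample (Lengths layout)

gathered = E[indices]                                    # Gather
segments = np.repeat(np.arange(len(lengths)), lengths)   # [0, 0, 0, 1]
pooled = np.zeros((len(lengths), E.shape[1]), dtype=np.float32)
np.add.at(pooled, segments, gathered)                    # per-segment ReduceSum
# pooled[0] == E[1] + E[3] + E[3]; pooled[1] == E[7]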
+ c2f = caffe2.python.onnx.frontend.Caffe2Frontend() + dlrm_caffe2_onnx = c2f.caffe2_net_to_onnx_model(pnet, inet, value_info) + # check the onnx model + onnx.checker.check_model(dlrm_caffe2_onnx) + + # save model to a file + with open("dlrm_s_caffe2.onnx", "w+") as dlrm_caffe2_onnx_file: + dlrm_caffe2_onnx_file.write(str(dlrm_caffe2_onnx)) + + # build protobuf with types and shapes + if args.save_proto_types_shapes: + # add types and shapes to protobuf + __TYPE_MAPPING = { + onnx.TensorProto.FLOAT: caffe2_pb2.TensorProto.FLOAT, + onnx.TensorProto.UINT8: caffe2_pb2.TensorProto.UINT8, + onnx.TensorProto.INT8: caffe2_pb2.TensorProto.INT8, + onnx.TensorProto.UINT16: caffe2_pb2.TensorProto.UINT16, + onnx.TensorProto.INT16: caffe2_pb2.TensorProto.INT16, + onnx.TensorProto.INT32: caffe2_pb2.TensorProto.INT32, + onnx.TensorProto.INT64: caffe2_pb2.TensorProto.INT64, + onnx.TensorProto.STRING: caffe2_pb2.TensorProto.STRING, + onnx.TensorProto.BOOL: caffe2_pb2.TensorProto.BOOL, + onnx.TensorProto.FLOAT16: caffe2_pb2.TensorProto.FLOAT16, + onnx.TensorProto.DOUBLE: caffe2_pb2.TensorProto.DOUBLE, + } + + pnet = dlrm.parameters().net.Proto() + arg = pnet.arg.add() + arg.name = "input_shape_info" + for i in pnet.external_input: + if i in dlrm.onnx_tsd: + onnx_dtype, shape = dlrm.onnx_tsd[i] + t = arg.tensors.add() + t.name = i + t.data_type = __TYPE_MAPPING[onnx_dtype] + t.dims.extend(shape) + else: + print("Warning: we don't have shape/type info for input: {}".format(i)) + # debug print + # print(pnet) + + # export the protobuf with types and shapes + with open("dlrm_s_caffe2.proto", "w+") as dlrm_s_proto_file: + dlrm_s_proto_file.write(str(pnet)) + + """ + # export the protobuf with types and shapes as well as weights + # see https://github.com/pytorch/pytorch/issues/9533 + #save + net = dlrm.parameters().net + params = dlrm.parameters().params + init_net, predict_net = mobile_exporter.Export(workspace, net, params) + with open("dlrm_s_caffe2.predict", "wb") as dlrm_s_predict_file: + dlrm_s_predict_file.write(predict_net.SerializeToString()) + with open("dlrm_s_caffe2.init", "wb") as dlrm_s_init_file: + dlrm_s_init_file.write(init_net.SerializeToString()) + #load + net_def = caffe2_pb2.NetDef() + init_def= caffe2_pb2.NetDef() + with open("dlrm_s_caffe2.predict", "rb") as dlrm_s_predict_file: + net_def.ParseFromString(dlrm_s_predict_file.read()) + print(net_def) + with open("dlrm_s_caffe2.init", "rb") as dlrm_s_init_file: + init_def.ParseFromString(dlrm_s_init_file.read()) + print(init_def) + """ diff --git a/benchmarks/dlrm/ootb/dlrm_s_pytorch.py b/benchmarks/dlrm/ootb/dlrm_s_pytorch.py new file mode 100644 index 0000000..1774eb4 --- /dev/null +++ b/benchmarks/dlrm/ootb/dlrm_s_pytorch.py @@ -0,0 +1,2511 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: an implementation of a deep learning recommendation model (DLRM) +# The model input consists of dense and sparse features. The former is a vector +# of floating point values. The latter is a list of sparse indices into +# embedding tables, which consist of vectors of floating point values. +# The selected vectors are passed to mlp networks denoted by triangles, +# in some cases the vectors are interacted through operators (Ops). +# +# output: +# vector of values +# model: | +# /\ +# /__\ +# | +# _____________________> Op <___________________ +# / | \ +# /\ /\ /\ +# /__\ /__\ ... 
/__\ +# | | | +# | Op Op +# | ____/__\_____ ____/__\____ +# | |_Emb_|____|__| ... |_Emb_|__|___| +# input: +# [ dense features ] [sparse indices] , ..., [sparse indices] +# +# More precise definition of model layers: +# 1) fully connected layers of an mlp +# z = f(y) +# y = Wx + b +# +# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) +# z = Op(e1,...,ek) +# obtain vectors e1=E[:,p1], ..., ek=E[:,pk] +# +# 3) Operator Op can be one of the following +# Sum(e1,...,ek) = e1 + ... + ek +# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] +# Cat(e1,...,ek) = [e1', ..., ek']' +# where ' denotes transpose operation +# +# References: +# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, +# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, +# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, +# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, +# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, +# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and +# Recommendation Systems", CoRR, arXiv:1906.00091, 2019 + +# TERMS: +# +# qr_ quotient-remainder trick +# md_ mixed-dimension trick +# lS_i Indices used as inputs to embedding bag operators. Indices determine +# which embeddings to select. +# lS_o Offsets used as inputs to embedding bag operators. Offsets determine how +# the selected embeddings are grouped together for the 'mode' operation. +# (Mode operation examples: sum, mean, max) + +from __future__ import absolute_import, division, print_function, unicode_literals + +import argparse + +# miscellaneous +import builtins +import datetime +import json +import sys +import time +import itertools +import traceback + +# onnx +# The onnx import causes deprecation warnings every time workers +# are spawned during testing. So, we filter out those warnings. 
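# [Editor's note: illustrative sketch, not part of the patch.]
# The three layer types defined in the header comment above, written out with
# NumPy on made-up shapes: (1) a fully connected layer y = Wx + b, (2) an
# embedding lookup over sparse indices, and (3) the Sum / Dot / Cat interaction
# operators. The header writes the lookup as selecting columns of E; the PyTorch
# tables in this file store one embedding vector per row, so rows are indexed here.
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(3, 4)).astype(np.float32)
b = np.zeros(3, dtype=np.float32)
x = rng.normal(size=4).astype(np.float32)
y = W @ x + b                                        # (1) fully connected layer

E = rng.normal(size=(10, 3)).astype(np.float32)      # one embedding table
p = [1, 4, 7]                                        # sparse indices for one lookup
e = [E[pk] for pk in p]                              # (2) embedding lookup

z_sum = np.sum(e, axis=0)                            # (3) Sum(e1, ..., ek)
z_dot = np.array([ei @ ej for ei in e for ej in e])  #     Dot: all pairwise dot products
z_cat = np.concatenate(e)                            #     Cat: concatenation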
+import warnings + +# data generation +import dlrm_data_pytorch as dp + +# For distributed run +import extend_distributed as ext_dist +import mlperf_logger + +# numpy +import numpy as np +import optim.rwsadagrad as RowWiseSparseAdagrad +import sklearn.metrics + +# pytorch +import torch +import torch.nn as nn +from torch._ops import ops +from torch.autograd.profiler import record_function +from torch.nn.parallel.parallel_apply import parallel_apply +from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.scatter_gather import gather, scatter +from torch.nn.parameter import Parameter +from torch.optim.lr_scheduler import _LRScheduler +from torch.utils.tensorboard import SummaryWriter + +try: + import fbgemm_gpu + from fbgemm_gpu import split_table_batched_embeddings_ops + from fbgemm_gpu.split_table_batched_embeddings_ops import ( + CacheAlgorithm, + PoolingMode, + OptimType, + SparseType, + SplitTableBatchedEmbeddingBagsCodegen, + IntNBitTableBatchedEmbeddingBagsCodegen, + ) +except (ImportError, OSError): + fbgemm_gpu_import_error_msg = traceback.format_exc() + fbgemm_gpu = None + +try: + import apex +except (ImportError, OSError): + apex_import_error_msg = traceback.format_exc() + apex = None + +try: + import torch2trt + from torch2trt import torch2trt +except (ImportError, OSError): + torch2trt_import_error_msg = traceback.format_exc() + torch2trt = None + +# mixed-dimension trick +from tricks.md_embedding_bag import PrEmbeddingBag, md_solver + +# FB5 Logger +import pathlib +from os import fspath +p = pathlib.Path(__file__).parent.resolve() / "../../../fb5logging" +sys.path.append(fspath(p)) +from fb5logger import FB5Logger +import loggerconstants + +# quotient-remainder trick +from tricks.qr_embedding_bag import QREmbeddingBag + +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + try: + import onnx + except ImportError as error: + print("Unable to import onnx. ", error) + +# from torchviz import make_dot +# import torch.nn.functional as Functional +# from torch.nn.parameter import Parameter + +exc = getattr(builtins, "IOError", "FileNotFoundError") + + +def time_wrap(use_gpu): + if use_gpu: + torch.cuda.synchronize() + return time.time() + + +def dlrm_wrap(X, lS_o, lS_i, use_gpu, device, ndevices=1): + with record_function("DLRM forward"): + if use_gpu: # .cuda() + # lS_i can be either a list of tensors or a stacked tensor. + # Handle each case below: + if ndevices == 1: + lS_i = ( + [S_i.to(device) for S_i in lS_i] + if isinstance(lS_i, list) + else lS_i.to(device) + ) + lS_o = ( + [S_o.to(device) for S_o in lS_o] + if isinstance(lS_o, list) + else lS_o.to(device) + ) + return dlrm(X.to(device), lS_o, lS_i) + + +def loss_fn_wrap(Z, T, use_gpu, device): + with record_function("DLRM loss compute"): + if args.loss_function == "mse" or args.loss_function == "bce": + return dlrm.loss_fn(Z, T.to(device)) + elif args.loss_function == "wbce": + loss_ws_ = dlrm.loss_ws[T.data.view(-1).long()].view_as(T).to(device) + loss_fn_ = dlrm.loss_fn(Z, T.to(device)) + loss_sc_ = loss_ws_ * loss_fn_ + return loss_sc_.mean() + + +# The following function is a wrapper to avoid checking this multiple times in th +# loop below. 
+def unpack_batch(b): + # Experiment with unweighted samples + return b[0], b[1], b[2], b[3], torch.ones(b[3].size()), None + + +class LRPolicyScheduler(_LRScheduler): + def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps): + self.num_warmup_steps = num_warmup_steps + self.decay_start_step = decay_start_step + self.decay_end_step = decay_start_step + num_decay_steps + self.num_decay_steps = num_decay_steps + + if self.decay_start_step < self.num_warmup_steps: + sys.exit("Learning rate warmup must finish before the decay starts") + + super(LRPolicyScheduler, self).__init__(optimizer) + + def get_lr(self): + step_count = self._step_count + if step_count < self.num_warmup_steps: + # warmup + scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps + lr = [base_lr * scale for base_lr in self.base_lrs] + self.last_lr = lr + elif self.decay_start_step <= step_count and step_count < self.decay_end_step: + # decay + decayed_steps = step_count - self.decay_start_step + scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2 + min_lr = 0.0000001 + lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs] + self.last_lr = lr + else: + if self.num_decay_steps > 0: + # freeze at last, either because we're after decay + # or because we're between warmup and decay + lr = self.last_lr + else: + # do not adjust + lr = self.base_lrs + return lr + + +# quantize_fbgemm_gpu_embedding_bag is partially lifted from +# fbgemm_gpu/test/split_embedding_inference_converter.py, def _quantize_split_embs. +# Converts SplitTableBatchedEmbeddingBagsCodegen to IntNBitTableBatchedEmbeddingBagsCodegen +def quantize_fbgemm_gpu_embedding_bag(model, quantize_type, device): + embedding_specs = [] + if device.type == "cpu": + emb_location = split_table_batched_embeddings_ops.EmbeddingLocation.HOST + else: + emb_location = split_table_batched_embeddings_ops.EmbeddingLocation.DEVICE + + for (E, D, _, _) in model.embedding_specs: + weights_ty = quantize_type + if D % weights_ty.align_size() != 0: + assert D % 4 == 0 + weights_ty = ( + SparseType.FP16 + ) # fall back to FP16 if dimension couldn't be aligned with the required size + embedding_specs.append(("", E, D, weights_ty, emb_location)) + + q_model = ( + split_table_batched_embeddings_ops.IntNBitTableBatchedEmbeddingBagsCodegen( + embedding_specs=embedding_specs, + pooling_mode=model.pooling_mode, + device=device, + ) + ) + q_model.initialize_weights() + for t, (_, _, _, weight_ty, _) in enumerate(embedding_specs): + if weight_ty == SparseType.FP16: + original_weight = model.split_embedding_weights()[t] + q_weight = original_weight.half() + weights = torch.tensor(q_weight.cpu().numpy().view(np.uint8)) + q_model.split_embedding_weights()[t][0].data.copy_(weights) + + elif weight_ty == SparseType.INT8: + original_weight = model.split_embedding_weights()[t] + q_weight = torch.ops.fbgemm.FloatToFused8BitRowwiseQuantized( + original_weight + ) + weights = q_weight[:, :-8] + scale_shift = torch.tensor( + q_weight[:, -8:] + .contiguous() + .cpu() + .numpy() + .view(np.float32) + .astype(np.float16) + .view(np.uint8) + ) + q_model.split_embedding_weights()[t][0].data.copy_(weights) + q_model.split_embedding_weights()[t][1].data.copy_(scale_shift) + + elif weight_ty == SparseType.INT4 or weight_ty == SparseType.INT2: + original_weight = model.split_embedding_weights()[t] + q_weight = torch.ops.fbgemm.FloatToFusedNBitRowwiseQuantizedSBHalf( + original_weight, + bit_rate=quantize_type.bit_rate(), + ) + weights = 
q_weight[:, :-4] + scale_shift = torch.tensor( + q_weight[:, -4:].contiguous().cpu().numpy().view(np.uint8) + ) + q_model.split_embedding_weights()[t][0].data.copy_(weights) + q_model.split_embedding_weights()[t][1].data.copy_(scale_shift) + return q_model + + +def create_fbgemm_gpu_emb_bag( + device, + emb_l, + m_spa, + quantize_bits, + learning_rate, + codegen_preference=None, + requires_grad=True, +): + if isinstance(emb_l[0], PrEmbeddingBag): + emb_l = [e.embs for e in emb_l] + if isinstance(emb_l[0], nn.EmbeddingBag): + emb_l = [e.weight for e in emb_l] + Es = [e.shape[0] for e in emb_l] + + if isinstance(m_spa, list): + Ds = m_spa + else: + Ds = [m_spa for _ in emb_l] + + if device.type == "cpu": + emb_location = split_table_batched_embeddings_ops.EmbeddingLocation.HOST + compute_device = split_table_batched_embeddings_ops.ComputeDevice.CPU + else: + emb_location = split_table_batched_embeddings_ops.EmbeddingLocation.DEVICE + compute_device = split_table_batched_embeddings_ops.ComputeDevice.CUDA + pooling_mode = PoolingMode.SUM + cache_algorithm = CacheAlgorithm.LRU + + sparse_type_dict = { + 4: SparseType.INT4, + 8: SparseType.INT8, + 16: SparseType.FP16, + 32: SparseType.FP32, + } + codegen_type_dict = { + 4: "IntN", + 8: "Split" if codegen_preference != "IntN" else "IntN", + 16: "Split" if codegen_preference != "IntN" else "IntN", + 32: "Split", + } + + codegen_type = codegen_type_dict[quantize_bits] + quantize_type = sparse_type_dict[quantize_bits] + if codegen_type == "IntN": + # Create non-quantized model and then call quantize_fbgemm_gpu_embedding_bag + fbgemm_gpu_emb_bag = SplitTableBatchedEmbeddingBagsCodegen( + embedding_specs=[ + ( + E, # num of rows in the table + D, # num of columns in the table + split_table_batched_embeddings_ops.EmbeddingLocation.HOST, + split_table_batched_embeddings_ops.ComputeDevice.CPU, + ) + for (E, D) in zip(Es, Ds) + ], + weights_precision=SparseType.FP32, + optimizer=OptimType.EXACT_SGD, + learning_rate=learning_rate, + cache_algorithm=cache_algorithm, + pooling_mode=pooling_mode, + ).to(device) + if quantize_type == quantize_type.FP16: + weights = fbgemm_gpu_emb_bag.split_embedding_weights() + for i, emb in enumerate(weights): + emb.data.copy_(emb_l[i]) + + elif quantize_type == quantize_type.INT8: + # copy quantized values upsampled/recasted to FP32 + for i in range(len(Es)): + fbgemm_gpu_emb_bag.split_embedding_weights()[i].data.copy_( + torch.ops.fbgemm.Fused8BitRowwiseQuantizedToFloat(emb_l[i]) + ) + elif quantize_type == quantize_type.INT4: + # copy quantized values upsampled/recasted to FP32 + for i in range(len(Es)): + fbgemm_gpu_emb_bag.split_embedding_weights()[i].data.copy_( + torch.ops.fbgemm.FusedNBitRowwiseQuantizedSBHalfToFloat( + emb_l[i], + bit_rate=quantize_type.bit_rate(), + ) + ) + fbgemm_gpu_emb_bag = quantize_fbgemm_gpu_embedding_bag( + fbgemm_gpu_emb_bag, quantize_type, device + ) + else: + fbgemm_gpu_emb_bag = SplitTableBatchedEmbeddingBagsCodegen( + embedding_specs=[ + ( + E, # num of rows in the table + D, # num of columns in the table + emb_location, + compute_device, + ) + for (E, D) in zip(Es, Ds) + ], + weights_precision=quantize_type, + optimizer=OptimType.EXACT_SGD, + learning_rate=learning_rate, + cache_algorithm=cache_algorithm, + pooling_mode=pooling_mode, + ).to(device) + + weights = fbgemm_gpu_emb_bag.split_embedding_weights() + for i, emb in enumerate(weights): + emb.data.copy_(emb_l[i]) + + if not requires_grad: + torch.no_grad() + torch.set_grad_enabled(False) + + return fbgemm_gpu_emb_bag + + +# The 
purpose of this wrapper is to encapsulate the format conversions to/from fbgemm_gpu +# so parallel_apply() executes the format-in -> fbgemm_gpu op -> format-out instructions +# for each respective GPU in parallel. +class fbgemm_gpu_emb_bag_wrapper(nn.Module): + def __init__( + self, + device, + emb_l, + m_spa, + quantize_bits, + learning_rate, + codegen_preference, + requires_grad, + ): + super(fbgemm_gpu_emb_bag_wrapper, self).__init__() + self.fbgemm_gpu_emb_bag = create_fbgemm_gpu_emb_bag( + device, + emb_l, + m_spa, + quantize_bits, + learning_rate, + codegen_preference, + requires_grad, + ) + self.device = device + self.m_spa = m_spa + # create cumsum array for mixed dimension support + if isinstance(m_spa, list): + self.m_spa_cumsum = np.cumsum([0] + m_spa) + if not requires_grad: + torch.no_grad() + torch.set_grad_enabled(False) + + def forward(self, lS_o, lS_i, v_W_l=None): + + # convert offsets to fbgemm format + lengths_list = list(map(len, lS_i)) + indices_lengths_cumsum = np.cumsum([0] + lengths_list) + if isinstance(lS_o, list): + lS_o = torch.stack(lS_o) + lS_o = lS_o.to(self.device) + lS_o += torch.from_numpy(indices_lengths_cumsum[:-1, np.newaxis]).to( + self.device + ) + numel = torch.tensor([indices_lengths_cumsum[-1]], dtype=torch.long).to( + self.device + ) + lS_o = torch.cat((lS_o.flatten(), numel)) + + # create per_sample_weights + if v_W_l: + per_sample_weights = torch.cat( + [a.gather(0, b) for a, b in zip(v_W_l, lS_i)] + ) + else: + per_sample_weights = None + + # convert indices to fbgemm_gpu format + if isinstance(lS_i, torch.Tensor): + lS_i = [lS_i] + lS_i = torch.cat(lS_i, dim=0).to(self.device) + + if isinstance(self.fbgemm_gpu_emb_bag, IntNBitTableBatchedEmbeddingBagsCodegen): + lS_o = lS_o.int() + lS_i = lS_i.int() + + # gpu embedding bag op + ly = self.fbgemm_gpu_emb_bag(lS_i, lS_o, per_sample_weights) + + # convert the results to the next layer's input format. + if isinstance(self.m_spa, list): + # handle mixed dimensions case. + ly = [ + ly[:, s:e] + for (s, e) in zip(self.m_spa_cumsum[:-1], self.m_spa_cumsum[1:]) + ] + else: + # handle case in which all tables share the same column dimension. 
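# [Editor's note: illustrative sketch, not part of the patch.]
# The offset conversion done in forward() above, on toy data: per-table offsets
# are shifted by the running count of indices belonging to earlier tables,
# flattened, and the grand total is appended, so a single fused kernel can walk
# all tables' lookups back to back. NumPy stand-in for the tensor code above.
import numpy as np

lS_i = [np.array([3, 1, 4]), np.array([5, 9])]   # indices for two tables
lS_o = [np.array([0, 2]), np.array([0, 1])]      # per-table offsets (batch size 2)

lengths = [len(i) for i in lS_i]                 # [3, 2]
cum = np.cumsum([0] + lengths)                   # [0, 3, 5]
fused_offsets = np.concatenate([o + c for o, c in zip(lS_o, cum[:-1])] + [cum[-1:]])
fused_indices = np.concatenate(lS_i)
# fused_offsets -> [0, 2, 3, 4, 5]; fused_indices -> [3, 1, 4, 5, 9]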
+ cols = self.m_spa + ntables = len(self.fbgemm_gpu_emb_bag.embedding_specs) + ly = ly.reshape(-1, ntables, cols).swapaxes(0, 1) + ly = list(ly) + return ly + + +### define dlrm in PyTorch ### +class DLRM_Net(nn.Module): + def create_mlp(self, ln, sigmoid_layer): + # build MLP layer by layer + layers = nn.ModuleList() + layers.training = self.requires_grad + for i in range(0, ln.size - 1): + n = ln[i] + m = ln[i + 1] + + # construct fully connected operator + LL = nn.Linear(int(n), int(m), bias=True) + + # initialize the weights + # with torch.no_grad(): + # custom Xavier input, output or two-sided fill + mean = 0.0 # std_dev = np.sqrt(variance) + std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n) + W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32) + std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1)) + bt = np.random.normal(mean, std_dev, size=m).astype(np.float32) + # approach 1 + LL.weight.data = torch.tensor(W) + LL.weight.requires_grad = self.requires_grad + LL.bias.data = torch.tensor(bt) + LL.bias.requires_grad = self.requires_grad + # approach 2 + # LL.weight.data.copy_(torch.tensor(W)) + # LL.bias.data.copy_(torch.tensor(bt)) + # approach 3 + # LL.weight = Parameter(torch.tensor(W),requires_grad=True) + # LL.bias = Parameter(torch.tensor(bt),requires_grad=True) + layers.append(LL) + + # construct sigmoid or relu operator + if i == sigmoid_layer: + layers.append(nn.Sigmoid()) + else: + layers.append(nn.ReLU()) + + # approach 1: use ModuleList + # return layers + # approach 2: use Sequential container to wrap all layers + return torch.nn.Sequential(*layers) + + def create_emb(self, m, ln, weighted_pooling=None): + # create_emb parameter description + # + # ln parameter: + # ln is a list of all the tables' row counts. E.g. [10,5,16] would mean + # table 0 has 10 rows, table 1 has 5 rows, and table 2 has 16 rows. + # + # m parameter (when m is a single value): + # m is the length of all embedding vectors. All embedding vectors in all + # embedding tables are created to be the same length. E.g. if ln were [3,2,5] + # and m were 4, table 0 would be dimension 3 x 4, table 1 would be 2 x 4, + # and table 2 would be 5 x 4. + # + # m parameter (when m is a list): + # m is a list of all the tables' column counts. E.g. if m were [4,5,6] and + # ln were [3,2,5], table 0 would be dimension 3 x 4, table 1 would be 2 x 5, + # and table 2 would be 5 x 6. + # + # Key to remember: + # embedding table i has shape: ln[i] rows, m columns, when m is a single value. + # embedding table i has shape: ln[i] rows, m[i] columns, when m is a list. + + emb_l = nn.ModuleList() + v_W_l = [] + for i in range(0, ln.size): + if ext_dist.my_size > 1: + if i not in self.local_emb_indices: + continue + n = ln[i] + + # construct embedding operator + if self.qr_flag and n > self.qr_threshold: + EE = QREmbeddingBag( + n, + m, + self.qr_collisions, + operation=self.qr_operation, + mode="sum", + sparse=True, + ) + elif self.md_flag and n > self.md_threshold: + base = max(m) + _m = m[i] if n > self.md_threshold else base + EE = PrEmbeddingBag(n, _m, base) + # use np initialization as below for consistency... 
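# [Editor's note: illustrative sketch, not part of the patch.]
# The idea behind the QREmbeddingBag branch above (quotient-remainder trick):
# a large index space of size n is covered by two much smaller tables, one
# addressed by the quotient and one by the remainder of the index with respect
# to the collision count, and the two looked-up vectors are combined (element-wise
# product for the default "mult" operation). Toy NumPy restatement only; the real
# implementation lives in tricks/qr_embedding_bag.py.
import numpy as np

n, m, collisions = 1000, 4, 10
rng = np.random.default_rng(0)
E_q = rng.normal(size=((n + collisions - 1) // collisions, m)).astype(np.float32)
E_r = rng.normal(size=(collisions, m)).astype(np.float32)

def qr_lookup(idx, operation="mult"):
    q, r = idx // collisions, idx % collisions
    return E_q[q] * E_r[r] if operation == "mult" else E_q[q] + E_r[r]

v = qr_lookup(537)   # uses row 53 of E_q and row 7 of E_r instead of row 537 of a 1000-row table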
+ W = np.random.uniform( + low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, _m) + ).astype(np.float32) + EE.embs.weight.data = torch.tensor(W, requires_grad=self.requires_grad) + else: + EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True) + # initialize embeddings + # nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n)) + W = np.random.uniform( + low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m) + ).astype(np.float32) + # approach 1 + EE.weight.data = torch.tensor(W, requires_grad=self.requires_grad) + # approach 2 + # EE.weight.data.copy_(torch.tensor(W)) + # approach 3 + # EE.weight = Parameter(torch.tensor(W),requires_grad=True) + if weighted_pooling is None: + v_W_l.append(None) + else: + v_W_l.append(torch.ones(n, dtype=torch.float32)) + emb_l.append(EE) + return emb_l, v_W_l + + def __init__( + self, + m_spa=None, + ln_emb=None, + ln_bot=None, + ln_top=None, + arch_interaction_op=None, + arch_interaction_itself=False, + sigmoid_bot=-1, + sigmoid_top=-1, + sync_dense_params=True, + loss_threshold=0.0, + ndevices=-1, + qr_flag=False, + qr_operation="mult", + qr_collisions=0, + qr_threshold=200, + md_flag=False, + md_threshold=200, + weighted_pooling=None, + loss_function="bce", + learning_rate=0.1, + use_gpu=False, + use_fbgemm_gpu=False, + fbgemm_gpu_codegen_pref="Split", + inference_only=False, + quantize_mlp_with_bit=False, + quantize_emb_with_bit=False, + ): + super(DLRM_Net, self).__init__() + + if ( + (m_spa is not None) + and (ln_emb is not None) + and (ln_bot is not None) + and (ln_top is not None) + and (arch_interaction_op is not None) + ): + # save arguments + self.ntables = len(ln_emb) + self.m_spa = m_spa + self.use_gpu = use_gpu + self.use_fbgemm_gpu = use_fbgemm_gpu + self.fbgemm_gpu_codegen_pref = fbgemm_gpu_codegen_pref + self.requires_grad = not inference_only + self.ndevices_available = ndevices + self.ndevices_in_use = ndevices + self.output_d = 0 + self.add_new_weights_to_params = False + self.arch_interaction_op = arch_interaction_op + self.arch_interaction_itself = arch_interaction_itself + self.sync_dense_params = sync_dense_params and not inference_only + self.loss_threshold = loss_threshold + self.loss_function = loss_function + self.learning_rate = learning_rate + if weighted_pooling is not None and weighted_pooling != "fixed": + self.weighted_pooling = "learned" + else: + self.weighted_pooling = weighted_pooling + # create variables for QR embedding if applicable + self.qr_flag = qr_flag + if self.qr_flag: + self.qr_collisions = qr_collisions + self.qr_operation = qr_operation + self.qr_threshold = qr_threshold + # create variables for MD embedding if applicable + self.md_flag = md_flag + if self.md_flag: + self.md_threshold = md_threshold + + # If running distributed, get local slice of embedding tables + if ext_dist.my_size > 1: + n_emb = len(ln_emb) + if n_emb < ext_dist.my_size: + sys.exit( + "only (%d) sparse features for (%d) devices, table partitions will fail" + % (n_emb, ext_dist.my_size) + ) + self.n_global_emb = n_emb + self.n_local_emb, self.n_emb_per_rank = ext_dist.get_split_lengths( + n_emb + ) + self.local_emb_slice = ext_dist.get_my_slice(n_emb) + self.local_emb_indices = list(range(n_emb))[self.local_emb_slice] + + # create operators + self.emb_l, self.v_W_l = self.create_emb(m_spa, ln_emb, weighted_pooling) + if self.weighted_pooling == "learned": + self.v_W_l = nn.ParameterList(list(map(Parameter, self.v_W_l))) + + self.bot_l = self.create_mlp(ln_bot, sigmoid_bot) + self.top_l = self.create_mlp(ln_top, sigmoid_top) + + # 
quantization + self.quantize_emb = False + self.emb_l_q = [] + self.quantize_bits = 32 + + # fbgemm_gpu + self.fbgemm_emb_l = [] + self.v_W_l_l = [self.v_W_l] if self.weighted_pooling else [None] + + self.interact_features_l = [] + + # specify the loss function + if self.loss_function == "mse": + self.loss_fn = torch.nn.MSELoss(reduction="mean") + elif self.loss_function == "bce": + self.loss_fn = torch.nn.BCELoss(reduction="mean") + elif self.loss_function == "wbce": + self.loss_ws = torch.tensor( + np.fromstring(args.loss_weights, dtype=float, sep="-") + ) + self.loss_fn = torch.nn.BCELoss(reduction="none") + else: + sys.exit( + "ERROR: --loss-function=" + self.loss_function + " is not supported" + ) + + def prepare_parallel_model(self, ndevices): + device_ids = range(ndevices) + # replicate mlp (data parallelism) + self.bot_l_replicas = replicate(self.bot_l, device_ids) + self.top_l_replicas = replicate(self.top_l, device_ids) + + # distribute embeddings (model parallelism) + if self.weighted_pooling is not None: + for k, w in enumerate(self.v_W_l): + self.v_W_l[k] = Parameter( + w.to(torch.device("cuda:" + str(k % ndevices))) + ) + if not self.use_fbgemm_gpu: + for k, w in enumerate(self.emb_l): + self.emb_l[k] = w.to(torch.device("cuda:" + str(k % ndevices))) + else: + self.fbgemm_emb_l, self.v_W_l_l = zip( + *[ + ( + fbgemm_gpu_emb_bag_wrapper( + torch.device("cuda:" + str(k)), + self.emb_l[k::ndevices] + if self.emb_l + else self.emb_l_q[k::ndevices], + self.m_spa[k::ndevices] + if isinstance(self.m_spa, list) + else self.m_spa, + self.quantize_bits, + self.learning_rate, + self.fbgemm_gpu_codegen_pref, + self.requires_grad, + ), + self.v_W_l[k::ndevices] if self.weighted_pooling else None, + ) + for k in range(ndevices) + ] + ) + self.add_new_weights_to_params = True + self.interact_features_l = [self.nn_module_wrapper() for _ in range(ndevices)] + + # nn_module_wrapper is used to call functions concurrently across multi-gpus, using parallel_apply, + # which requires an nn.Module subclass. + class nn_module_wrapper(nn.Module): + def __init__(self): + super(DLRM_Net.nn_module_wrapper, self).__init__() + def forward(self, E, x, ly): + return E(x, ly) + + def apply_mlp(self, x, layers): + # approach 1: use ModuleList + # for layer in layers: + # x = layer(x) + # return x + # approach 2: use Sequential container to wrap all layers + return layers(x) + + def apply_emb(self, lS_o, lS_i): + # WARNING: notice that we are processing the batch at once. We implicitly + # assume that the data is laid out such that: + # 1. each embedding is indexed with a group of sparse indices, + # corresponding to a single lookup + # 2. for each embedding the lookups are further organized into a batch + # 3. for a list of embedding tables there is a list of batched lookups + + if self.use_fbgemm_gpu: + # Deinterleave and reshape to 2d, so items are grouped by device + # per row. Then parallel apply. + ndevices = len(self.fbgemm_emb_l) + lS_o_l = [lS_o[k::ndevices] for k in range(ndevices)] + lS_i_l = [lS_i[k::ndevices] for k in range(ndevices)] + ly = parallel_apply( + self.fbgemm_emb_l, list(zip(lS_o_l, lS_i_l, self.v_W_l_l)) + ) + # Interleave and flatten to match non-fbgemm_gpu ly format. + ly = [ly[i % ndevices][i // ndevices] for i in range(self.ntables)] + else: + ly = [] + for k, sparse_index_group_batch in enumerate(lS_i): + sparse_offset_group_batch = lS_o[k] + + # embedding lookup + # We are using EmbeddingBag, which implicitly uses sum operator. 
+ # The embeddings are represented as tall matrices, with sum + # happening vertically across 0 axis, resulting in a row vector + # E = emb_l[k] + + if self.v_W_l[k] is not None: + per_sample_weights = self.v_W_l[k].gather( + 0, sparse_index_group_batch + ) + else: + per_sample_weights = None + + if self.quantize_emb: + if self.quantize_bits == 4: + E = ops.quantized.embedding_bag_4bit_rowwise_offsets + elif self.quantize_bits == 8: + E = ops.quantized.embedding_bag_byte_rowwise_offsets + QV = E( + self.emb_l_q[k], + sparse_index_group_batch, + sparse_offset_group_batch, + per_sample_weights=per_sample_weights, + ) + + ly.append(QV) + else: + E = self.emb_l[k] + V = E( + sparse_index_group_batch, + sparse_offset_group_batch, + per_sample_weights=per_sample_weights, + ) + + ly.append(V) + + # print(ly) + return ly + + # using quantizing functions from caffe2/aten/src/ATen/native/quantized/cpu + def quantize_embedding(self, bits): + + n = len(self.emb_l) + self.emb_l_q = [None] * n + for k in range(n): + if bits == 4: + self.emb_l_q[k] = ops.quantized.embedding_bag_4bit_prepack( + self.emb_l[k].weight + ) + elif bits == 8: + self.emb_l_q[k] = ops.quantized.embedding_bag_byte_prepack( + self.emb_l[k].weight + ) + elif bits == 16: + self.emb_l_q[k] = self.emb_l[k].half().weight + else: + return + self.emb_l = None + self.quantize_emb = True + self.quantize_bits = bits + + def interact_features(self, x, ly): + + if self.arch_interaction_op == "dot": + # concatenate dense and sparse features + (batch_size, d) = x.shape + T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d)) + # perform a dot product + Z = torch.bmm(T, torch.transpose(T, 1, 2)) + # append dense feature with the interactions (into a row vector) + # approach 1: all + # Zflat = Z.view((batch_size, -1)) + # approach 2: unique + _, ni, nj = Z.shape + # approach 1: tril_indices + # offset = 0 if self.arch_interaction_itself else -1 + # li, lj = torch.tril_indices(ni, nj, offset=offset) + # approach 2: custom + offset = 1 if self.arch_interaction_itself else 0 + li = torch.tensor([i for i in range(ni) for j in range(i + offset)]) + lj = torch.tensor([j for i in range(nj) for j in range(i + offset)]) + Zflat = Z[:, li, lj] + # concatenate dense features and interactions + R = torch.cat([x] + [Zflat], dim=1) + elif self.arch_interaction_op == "cat": + # concatenation features (into a row vector) + R = torch.cat([x] + ly, dim=1) + else: + sys.exit( + "ERROR: --arch-interaction-op=" + + self.arch_interaction_op + + " is not supported" + ) + + return R + + def forward(self, dense_x, lS_o, lS_i): + if ext_dist.my_size > 1: + # multi-node multi-device run + return self.distributed_forward(dense_x, lS_o, lS_i) + elif self.ndevices_available <= 1: + # single device run + return self.sequential_forward(dense_x, lS_o, lS_i) + else: + # single-node multi-device run + return self.parallel_forward(dense_x, lS_o, lS_i) + + def distributed_forward(self, dense_x, lS_o, lS_i): + batch_size = dense_x.size()[0] + # WARNING: # of ranks must be <= batch size in distributed_forward call + if batch_size < ext_dist.my_size: + sys.exit( + "ERROR: batch_size (%d) must be larger than number of ranks (%d)" + % (batch_size, ext_dist.my_size) + ) + if batch_size % ext_dist.my_size != 0: + sys.exit( + "ERROR: batch_size %d can not split across %d ranks evenly" + % (batch_size, ext_dist.my_size) + ) + + dense_x = dense_x[ext_dist.get_my_slice(batch_size)] + lS_o = lS_o[self.local_emb_slice] + lS_i = lS_i[self.local_emb_slice] + + if (self.ntables != len(lS_o)) or 
(self.ntables != len(lS_i)): + sys.exit( + "ERROR: corrupted model input detected in distributed_forward call" + ) + + # embeddings + with record_function("DLRM embedding forward"): + ly = self.apply_emb(lS_o, lS_i) + + # WARNING: Note that at this point we have the result of the embedding lookup + # for the entire batch on each rank. We would like to obtain partial results + # corresponding to all embedding lookups, but part of the batch on each rank. + # Therefore, matching the distribution of output of bottom mlp, so that both + # could be used for subsequent interactions on each device. + if self.ntables != len(ly): + sys.exit("ERROR: corrupted intermediate result in distributed_forward call") + + a2a_req = ext_dist.alltoall(ly, self.n_emb_per_rank) + + with record_function("DLRM bottom nlp forward"): + x = self.apply_mlp(dense_x, self.bot_l) + + ly = a2a_req.wait() + ly = list(ly) + + # interactions + with record_function("DLRM interaction forward"): + z = self.interact_features(x, ly) + + # top mlp + with record_function("DLRM top nlp forward"): + p = self.apply_mlp(z, self.top_l) + + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold)) + else: + z = p + + return z + + def sequential_forward(self, dense_x, lS_o, lS_i): + # process dense features (using bottom mlp), resulting in a row vector + x = self.apply_mlp(dense_x, self.bot_l) + # debug prints + # print("intermediate") + # print(x.detach().cpu().numpy()) + + # process sparse features(using embeddings), resulting in a list of row vectors + ly = self.apply_emb(lS_o, lS_i) + # for y in ly: + # print(y.detach().cpu().numpy()) + + # interact features (dense and sparse) + z = self.interact_features(x, ly) + # print(z.detach().cpu().numpy()) + + # obtain probability of a click (using top mlp) + p = self.apply_mlp(z, self.top_l) + + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold)) + else: + z = p + + return z + + def parallel_forward(self, dense_x, lS_o, lS_i): + ### prepare model (overwrite) ### + # WARNING: # of devices must be >= batch size in parallel_forward call + batch_size = dense_x.size()[0] + ndevices = min(self.ndevices_available, batch_size, self.ntables) + device_ids = range(ndevices) + # WARNING: must redistribute the model if mini-batch size changes(this is common + # for last mini-batch, when # of elements in the dataset/batch size is not even + if self.ndevices_in_use != ndevices: + self.ndevices_in_use = ndevices + self.prepare_parallel_model(ndevices) + elif self.sync_dense_params: + # When training, replicate the new/updated mlp weights each iteration. + # For inference-only, this code should never run. 
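# [Editor's note: illustrative sketch, not part of the patch.]
# The "dot" branch of interact_features above, reduced to a toy example. With
# num_fea feature vectors of dimension d per sample (the bottom-mlp output plus
# the embedding outputs), bmm produces all pairwise dot products and only the
# strictly lower triangular entries are kept, which is why run() later sizes the
# top-mlp input as num_fea * (num_fea - 1) // 2 + m_den_out. Shapes are made up.
import torch

batch_size, d, num_fea = 2, 4, 3
x = torch.randn(batch_size, d)                                   # bottom-mlp output
ly = [torch.randn(batch_size, d) for _ in range(num_fea - 1)]    # embedding outputs

T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))         # (batch, num_fea, d)
Z = torch.bmm(T, torch.transpose(T, 1, 2))                       # pairwise dot products
li = torch.tensor([i for i in range(num_fea) for j in range(i)])
lj = torch.tensor([j for i in range(num_fea) for j in range(i)])
Zflat = Z[:, li, lj]                      # num_fea * (num_fea - 1) // 2 = 3 interaction terms
R = torch.cat([x, Zflat], dim=1)          # width d + 3 = 7, the top-mlp input per sample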
+ self.bot_l_replicas = replicate(self.bot_l, device_ids) + self.top_l_replicas = replicate(self.top_l, device_ids) + + ### prepare input (overwrite) ### + # scatter dense features (data parallelism) + # print(dense_x.device) + dense_x = scatter(dense_x, device_ids, dim=0) + # distribute sparse features (model parallelism) + if (self.ntables != len(lS_o)) or (self.ntables != len(lS_i)): + sys.exit("ERROR: corrupted model input detected in parallel_forward call") + + lS_o = [ + lS_o[k].to(torch.device("cuda:" + str(k % ndevices))) + for k in range(self.ntables) + ] + lS_i = [ + lS_i[k].to(torch.device("cuda:" + str(k % ndevices))) + for k in range(self.ntables) + ] + + ### compute results in parallel ### + # bottom mlp + # WARNING: Note that the self.bot_l is a list of bottom mlp modules + # that have been replicated across devices, while dense_x is a tuple of dense + # inputs that has been scattered across devices on the first (batch) dimension. + # The output is a list of tensors scattered across devices according to the + # distribution of dense_x. + x = parallel_apply(self.bot_l_replicas, dense_x, None, device_ids) + # debug prints + # print(x) + + # embeddings + ly = self.apply_emb(lS_o, lS_i) + # debug prints + # print(ly) + + # butterfly shuffle (implemented inefficiently for now) + # WARNING: Note that at this point we have the result of the embedding lookup + # for the entire batch on each device. We would like to obtain partial results + # corresponding to all embedding lookups, but part of the batch on each device. + # Therefore, matching the distribution of output of bottom mlp, so that both + # could be used for subsequent interactions on each device. + if self.ntables != len(ly): + sys.exit("ERROR: corrupted intermediate result in parallel_forward call") + + t_list = [scatter(ly[k], device_ids, dim=0) for k in range(self.ntables)] + + # adjust the list to be ordered per device + ly = list(map(lambda y: list(y), zip(*t_list))) + # debug prints + # print(ly) + + # interactions + z = parallel_apply(self.interact_features_l, list(zip(itertools.repeat(self.interact_features),x,ly))) + # debug prints + # print(z) + + # top mlp + # WARNING: Note that the self.top_l is a list of top mlp modules that + # have been replicated across devices, while z is a list of interaction results + # that by construction are scattered across devices on the first (batch) dim. + # The output is a list of tensors scattered across devices according to the + # distribution of z. 
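# [Editor's note: illustrative sketch, not part of the patch.]
# The "butterfly shuffle" performed just above, restated with plain lists instead
# of scattered CUDA tensors: before the shuffle, each device holds the full-batch
# output of its own tables; after scattering every table's output by batch chunk
# and regrouping, each device holds its batch slice for all tables, which is what
# the per-device interaction step needs.
ndevices, ntables = 2, 3
ly = [[f"tbl{k}_chunk{c}" for c in range(ndevices)] for k in range(ntables)]
t_list = ly                                        # stand-in for scatter(ly[k], device_ids, dim=0)
per_device = [list(chunks) for chunks in zip(*t_list)]
# per_device[0] == ['tbl0_chunk0', 'tbl1_chunk0', 'tbl2_chunk0']
# per_device[1] == ['tbl0_chunk1', 'tbl1_chunk1', 'tbl2_chunk1']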
+ p = parallel_apply(self.top_l_replicas, z, None, device_ids) + + ### gather the distributed results ### + p0 = gather(p, self.output_d, dim=0) + + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z0 = torch.clamp( + p0, min=self.loss_threshold, max=(1.0 - self.loss_threshold) + ) + else: + z0 = p0 + + return z0 + + def print_weights(self): + if self.use_fbgemm_gpu and len(self.fbgemm_emb_l): + ntables_l = [ + len(e.fbgemm_gpu_emb_bag.embedding_specs) for e in self.fbgemm_emb_l + ] + for j in range(ntables_l[0] + 1): + for k, e in enumerate(self.fbgemm_emb_l): + if j < ntables_l[k]: + print( + e.fbgemm_gpu_emb_bag.split_embedding_weights()[j] + .detach() + .cpu() + .numpy() + ) + elif self.quantize_bits != 32: + for e in self.emb_l_q: + print(e.data.detach().cpu().numpy()) + else: # if self.emb_l: + for param in self.emb_l.parameters(): + print(param.detach().cpu().numpy()) + if isinstance(self.v_W_l, nn.ParameterList): + for param in self.v_W_l.parameters(): + print(param.detach().cpu().numpy()) + for param in self.bot_l.parameters(): + print(param.detach().cpu().numpy()) + for param in self.top_l.parameters(): + print(param.detach().cpu().numpy()) + + +def dash_separated_ints(value): + vals = value.split("-") + for val in vals: + try: + int(val) + except ValueError: + raise argparse.ArgumentTypeError( + "%s is not a valid dash separated list of ints" % value + ) + + return value + + +def dash_separated_floats(value): + vals = value.split("-") + for val in vals: + try: + float(val) + except ValueError: + raise argparse.ArgumentTypeError( + "%s is not a valid dash separated list of floats" % value + ) + + return value + + +def inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld, + device, + use_gpu, + log_iter=-1, +): + test_accu = 0 + test_samp = 0 + + if args.mlperf_logging: + scores = [] + targets = [] + + if args.fb5logger is not None: + fb5logger = FB5Logger(args.fb5logger) + fb5logger.header("DLRM", "OOTB", "eval", args.fb5config, score_metric=loggerconstants.EXPS) + + for i, testBatch in enumerate(test_ld): + # early exit if nbatches was set by the user and was exceeded + if nbatches > 0 and i >= nbatches: + break + + if i == args.warmup_steps and args.fb5logger is not None: + fb5logger.run_start() + + X_test, lS_o_test, lS_i_test, T_test, W_test, CBPP_test = unpack_batch( + testBatch + ) + + # Skip the batch if batch size not multiple of total ranks + if ext_dist.my_size > 1 and X_test.size(0) % ext_dist.my_size != 0: + print("Warning: Skiping the batch %d with size %d" % (i, X_test.size(0))) + continue + + # forward pass + Z_test = dlrm_wrap( + X_test, + lS_o_test, + lS_i_test, + use_gpu, + device, + ndevices=ndevices, + ) + ### gather the distributed results on each rank ### + # For some reason it requires explicit sync before all_gather call if + # tensor is on GPU memory + if Z_test.is_cuda: + torch.cuda.synchronize() + (_, batch_split_lengths) = ext_dist.get_split_lengths(X_test.size(0)) + if ext_dist.my_size > 1: + Z_test = ext_dist.all_gather(Z_test, batch_split_lengths) + + if args.mlperf_logging: + S_test = Z_test.detach().cpu().numpy() # numpy array + T_test = T_test.detach().cpu().numpy() # numpy array + scores.append(S_test) + targets.append(T_test) + else: + with record_function("DLRM accuracy compute"): + # compute loss and accuracy + S_test = Z_test.detach().cpu().numpy() # numpy array + T_test = T_test.detach().cpu().numpy() # numpy array + + mbs_test = T_test.shape[0] # = mini_batch_size except last + 
A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8)) + + test_accu += A_test + test_samp += mbs_test + + if args.fb5logger is not None: + fb5logger.run_stop(nbatches - args.warmup_steps, args.mini_batch_size) + + if args.mlperf_logging: + with record_function("DLRM mlperf sklearn metrics compute"): + scores = np.concatenate(scores, axis=0) + targets = np.concatenate(targets, axis=0) + + metrics = { + "recall": lambda y_true, y_score: sklearn.metrics.recall_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "precision": lambda y_true, y_score: sklearn.metrics.precision_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "f1": lambda y_true, y_score: sklearn.metrics.f1_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "ap": sklearn.metrics.average_precision_score, + "roc_auc": sklearn.metrics.roc_auc_score, + "accuracy": lambda y_true, y_score: sklearn.metrics.accuracy_score( + y_true=y_true, y_pred=np.round(y_score) + ), + } + + validation_results = {} + for metric_name, metric_function in metrics.items(): + validation_results[metric_name] = metric_function(targets, scores) + writer.add_scalar( + "mlperf-metrics-test/" + metric_name, + validation_results[metric_name], + log_iter, + ) + acc_test = validation_results["accuracy"] + else: + acc_test = test_accu / test_samp + writer.add_scalar("Test/Acc", acc_test, log_iter) + + model_metrics_dict = { + "nepochs": args.nepochs, + "nbatches": nbatches, + "nbatches_test": nbatches_test, + "state_dict": dlrm.state_dict(), + "test_acc": acc_test, + } + + if args.mlperf_logging: + is_best = validation_results["roc_auc"] > best_auc_test + if is_best: + best_auc_test = validation_results["roc_auc"] + model_metrics_dict["test_auc"] = best_auc_test + print( + "recall {:.4f}, precision {:.4f},".format( + validation_results["recall"], + validation_results["precision"], + ) + + " f1 {:.4f}, ap {:.4f},".format( + validation_results["f1"], validation_results["ap"] + ) + + " auc {:.4f}, best auc {:.4f},".format( + validation_results["roc_auc"], best_auc_test + ) + + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format( + validation_results["accuracy"] * 100, best_acc_test * 100 + ), + flush=True, + ) + else: + is_best = acc_test > best_acc_test + if is_best: + best_acc_test = acc_test + print( + " accuracy {:3.3f} %, best {:3.3f} %".format( + acc_test * 100, best_acc_test * 100 + ), + flush=True, + ) + return model_metrics_dict, is_best + + +def run(): + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Train Deep Learning Recommendation Model (DLRM)" + ) + # model related parameters + parser.add_argument("--arch-sparse-feature-size", type=int, default=2) + parser.add_argument( + "--arch-embedding-size", type=dash_separated_ints, default="4-3-2" + ) + # j will be replaced with the table number + parser.add_argument("--arch-mlp-bot", type=dash_separated_ints, default="4-3-2") + parser.add_argument("--arch-mlp-top", type=dash_separated_ints, default="4-2-1") + parser.add_argument( + "--arch-interaction-op", type=str, choices=["dot", "cat"], default="dot" + ) + parser.add_argument("--arch-interaction-itself", action="store_true", default=False) + parser.add_argument( + "--weighted-pooling", type=str, choices=["fixed", "learned", None], default=None + ) + + # embedding table options + parser.add_argument("--md-flag", action="store_true", default=False) + parser.add_argument("--md-threshold", type=int, default=200) + parser.add_argument("--md-temperature", type=float, default=0.3) + 
parser.add_argument("--md-round-dims", action="store_true", default=False) + parser.add_argument("--qr-flag", action="store_true", default=False) + parser.add_argument("--qr-threshold", type=int, default=200) + parser.add_argument("--qr-operation", type=str, default="mult") + parser.add_argument("--qr-collisions", type=int, default=4) + # activations and loss + parser.add_argument("--activation-function", type=str, default="relu") + parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce + parser.add_argument( + "--loss-weights", type=dash_separated_floats, default="1.0-1.0" + ) # for wbce + parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 + parser.add_argument("--round-targets", type=bool, default=False) + # data + parser.add_argument("--data-size", type=int, default=1) + parser.add_argument("--num-batches", type=int, default=0) + parser.add_argument( + "--data-generation", type=str, default="random" + ) # synthetic or dataset + parser.add_argument( + "--rand-data-dist", type=str, default="uniform" + ) # uniform or gaussian + parser.add_argument("--rand-data-min", type=float, default=0) + parser.add_argument("--rand-data-max", type=float, default=1) + parser.add_argument("--rand-data-mu", type=float, default=-1) + parser.add_argument("--rand-data-sigma", type=float, default=1) + parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log") + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--data-trace-enable-padding", type=bool, default=False) + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--num-indices-per-lookup", type=int, default=10) + parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False) + parser.add_argument("--num-workers", type=int, default=0) + parser.add_argument("--memory-map", action="store_true", default=False) + # training + parser.add_argument("--mini-batch-size", type=int, default=1) + parser.add_argument("--nepochs", type=int, default=1) + parser.add_argument("--learning-rate", type=float, default=0.01) + parser.add_argument("--print-precision", type=int, default=5) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + parser.add_argument("--sync-dense-params", type=bool, default=True) + parser.add_argument("--optimizer", type=str, default="sgd") + parser.add_argument( + "--dataset-multiprocessing", + action="store_true", + default=False, + help="The Kaggle dataset can be multiprocessed in an environment \ + with more than 7 CPU cores and more than 20 GB of memory. 
\n \ + The Terabyte dataset can be multiprocessed in an environment \ + with more than 24 CPU cores and at least 1 TB of memory.", + ) + # inference + parser.add_argument("--inference-only", action="store_true", default=False) + # quantize + parser.add_argument("--quantize-mlp-with-bit", type=int, default=32) + parser.add_argument("--quantize-emb-with-bit", type=int, default=32) + # onnx + parser.add_argument("--save-onnx", action="store_true", default=False) + # gpu + parser.add_argument("--use-gpu", action="store_true", default=False) + parser.add_argument("--use-fbgemm-gpu", action="store_true", default=False) + parser.add_argument( + "--fbgemm-gpu-codegen-pref", + type=str, + choices=["Split", "IntN"], + default="Split", + ) + # torch2trt + parser.add_argument("--use-torch2trt-for-mlp", action="store_true", default=False) + # distributed + parser.add_argument("--local_rank", type=int, default=-1) + parser.add_argument("--dist-backend", type=str, default="") + # debugging and profiling + parser.add_argument("--print-freq", type=int, default=1) + parser.add_argument("--test-freq", type=int, default=-1) + parser.add_argument("--test-mini-batch-size", type=int, default=-1) + parser.add_argument("--test-num-workers", type=int, default=-1) + parser.add_argument("--print-time", action="store_true", default=False) + parser.add_argument("--print-wall-time", action="store_true", default=False) + parser.add_argument("--print-accumulated-time", action="store_true", default=False) + parser.add_argument("--debug-mode", action="store_true", default=False) + parser.add_argument("--enable-profiling", action="store_true", default=False) + parser.add_argument("--plot-compute-graph", action="store_true", default=False) + parser.add_argument("--tensor-board-filename", type=str, default="run_kaggle_pt") + # store/load model + parser.add_argument("--save-model", type=str, default="") + parser.add_argument("--load-model", type=str, default="") + # mlperf logging (disables other output and stops early) + parser.add_argument("--mlperf-logging", action="store_true", default=False) + # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107 + parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0) + # stop at target AUC Terabyte (no subsampling) 0.8025 + parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) + parser.add_argument("--mlperf-bin-loader", action="store_true", default=False) + parser.add_argument("--mlperf-bin-shuffle", action="store_true", default=False) + # mlperf gradient accumulation iterations + parser.add_argument("--mlperf-grad-accum-iter", type=int, default=1) + # LR policy + parser.add_argument("--lr-num-warmup-steps", type=int, default=0) + parser.add_argument("--lr-decay-start-step", type=int, default=0) + parser.add_argument("--lr-num-decay-steps", type=int, default=0) + + parser.add_argument("--precache-ml-data", type=int, nargs='?', default=None, const=sys.maxsize) + parser.add_argument("--warmup-steps", type=int, default=0) + # FB5 Logging + parser.add_argument("--fb5logger", type=str, default=None) + parser.add_argument("--fb5config", type=str, default="tiny") + + global args + global nbatches + global nbatches_test + global writer + args = parser.parse_args() + + if args.dataset_multiprocessing: + assert float(sys.version[:3]) > 3.7, ( + "The dataset_multiprocessing " + + "flag is susceptible to a bug in Python 3.7 and under. 
" + + "https://github.com/facebookresearch/dlrm/issues/172" + ) + + if args.mlperf_logging: + mlperf_logger.log_event(key=mlperf_logger.constants.CACHE_CLEAR, value=True) + mlperf_logger.log_start( + key=mlperf_logger.constants.INIT_START, log_all_ranks=True + ) + + if args.weighted_pooling is not None: + if args.qr_flag: + sys.exit("ERROR: quotient remainder with weighted pooling is not supported") + if args.md_flag: + sys.exit("ERROR: mixed dimensions with weighted pooling is not supported") + if args.quantize_emb_with_bit in [4, 8]: + if args.qr_flag: + sys.exit( + "ERROR: 4 and 8-bit quantization with quotient remainder is not supported" + ) + if args.md_flag: + sys.exit( + "ERROR: 4 and 8-bit quantization with mixed dimensions is not supported" + ) + if args.quantize_emb_with_bit in [4, 8, 16] and ( + not fbgemm_gpu or not args.use_fbgemm_gpu + ): + extra_info = "" + if not fbgemm_gpu: + extra_info += "\nfbgemm_gpu module failed to import.\n\n" + fbgemm_gpu_import_error_msg + if not args.use_fbgemm_gpu: + extra_info += "--use-fbgemm-gpu not set. " + + if not args.inference_only: + sys.exit( + "ERROR: Training quantized embeddings requires fbgemm_gpu. " + + extra_info + ) + elif args.use_gpu: + sys.exit( + "ERROR: Quantized embeddings on GPU requires fbgemm_gpu. " + extra_info + ) + elif args.quantize_emb_with_bit == 16: + sys.exit( + "ERROR: 16-bit quantized embeddings requires fbgemm_gpu. " + extra_info + ) + + assert args.quantize_emb_with_bit in [ + 4, + 8, + 16, + 32, + ], "only support 4/8/16/32-bit but got {}".format(args.quantize_emb_with_bit) + + if args.use_gpu: + assert torch.cuda.is_available(), "No cuda device is available." + if args.use_fbgemm_gpu: + assert fbgemm_gpu, ("\nfbgemm_gpu module failed to import.\n\n" + fbgemm_gpu_import_error_msg) + use_gpu = args.use_gpu + use_fbgemm_gpu = args.use_fbgemm_gpu + + ### some basic setup ### + np.random.seed(args.numpy_rand_seed) + np.set_printoptions(precision=args.print_precision) + torch.set_printoptions(precision=args.print_precision) + torch.manual_seed(args.numpy_rand_seed) + + if args.test_mini_batch_size < 0: + # if the parameter is not set, use the training batch size + args.test_mini_batch_size = args.mini_batch_size + if args.test_num_workers < 0: + # if the parameter is not set, use the same parameter for training + args.test_num_workers = args.num_workers + + if not args.debug_mode: + ext_dist.init_distributed( + local_rank=args.local_rank, use_gpu=use_gpu, backend=args.dist_backend + ) + + if use_gpu: + torch.cuda.manual_seed_all(args.numpy_rand_seed) + torch.backends.cudnn.deterministic = True + if ext_dist.my_size > 1: + ngpus = 1 + device = torch.device("cuda", ext_dist.my_local_rank) + else: + ngpus = torch.cuda.device_count() + device = torch.device("cuda", 0) + print("Using {} GPU(s)...".format(ngpus)) + else: + device = torch.device("cpu") + print("Using CPU...") + + ### prepare training data ### + ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") + # input data + + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_end(key=mlperf_logger.constants.INIT_STOP) + mlperf_logger.barrier() + mlperf_logger.log_start(key=mlperf_logger.constants.RUN_START) + mlperf_logger.barrier() + + if args.data_generation == "dataset": + train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args) + table_feature_map = {idx: idx for idx in range(len(train_data.counts))} + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + nbatches_test = len(test_ld) + + 
ln_emb = train_data.counts + # enforce maximum limit on number of vectors per embedding + if args.max_ind_range > 0: + ln_emb = np.array( + list( + map( + lambda x: x if x < args.max_ind_range else args.max_ind_range, + ln_emb, + ) + ) + ) + else: + ln_emb = np.array(ln_emb) + m_den = train_data.m_den + ln_bot[0] = m_den + else: + # input and target at random + ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") + m_den = ln_bot[0] + train_data, train_ld, test_data, test_ld = dp.make_random_data_and_loader( + args, ln_emb, m_den, cache_size=args.precache_ml_data + ) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + nbatches_test = len(test_ld) + + assert args.num_batches > args.warmup_steps, (f"Change --warmup-steps={args.warmup_steps} to be lower than --num-batches={args.num_batches}.") + + args.ln_emb = ln_emb.tolist() + if args.mlperf_logging: + print("command line args: ", json.dumps(vars(args))) + + ### parse command line arguments ### + m_spa = args.arch_sparse_feature_size + ln_emb = np.asarray(ln_emb) + num_fea = ln_emb.size + 1 # num sparse + num dense features + + if args.use_fbgemm_gpu: + assert m_spa % 4 == 0, ( + f"{m_spa} % 4 is not 0, but fbgemm_gpu requires the embedding dim " + + "(--arch-sparse-feature-size number) to be evenly divisible by 4." + ) + + m_den_out = ln_bot[ln_bot.size - 1] + if args.arch_interaction_op == "dot": + # approach 1: all + # num_int = num_fea * num_fea + m_den_out + # approach 2: unique + if args.arch_interaction_itself: + num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out + else: + num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out + elif args.arch_interaction_op == "cat": + num_int = num_fea * m_den_out + else: + sys.exit( + "ERROR: --arch-interaction-op=" + + args.arch_interaction_op + + " is not supported" + ) + arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top + ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") + + # sanity check: feature sizes and mlp dimensions must match + if m_den != ln_bot[0]: + sys.exit( + "ERROR: arch-dense-feature-size " + + str(m_den) + + " does not match first dim of bottom mlp " + + str(ln_bot[0]) + ) + if args.qr_flag: + if args.qr_operation == "concat" and 2 * m_spa != m_den_out: + sys.exit( + "ERROR: 2 arch-sparse-feature-size " + + str(2 * m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + + " (note that the last dim of bottom mlp must be 2x the embedding dim)" + ) + if args.qr_operation != "concat" and m_spa != m_den_out: + sys.exit( + "ERROR: arch-sparse-feature-size " + + str(m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + ) + else: + if m_spa != m_den_out: + sys.exit( + "ERROR: arch-sparse-feature-size " + + str(m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + ) + if num_int != ln_top[0]: + sys.exit( + "ERROR: # of feature interactions " + + str(num_int) + + " does not match first dimension of top mlp " + + str(ln_top[0]) + ) + + # assign mixed dimensions if applicable + if args.md_flag: + m_spa = md_solver( + torch.tensor(ln_emb), + args.md_temperature, # alpha + d0=m_spa, + round_dim=args.md_round_dims, + ).tolist() + if use_fbgemm_gpu: + for m in m_spa: + assert m % 4 == 0, ( + "Found an incompatible embedding dim in m_spa. " + + f"{m} % 4 is not 0, but fbgemm_gpu requires the " + + "embedding dim to be evenly divisible by 4." 
+ ) + + # test prints (model arch) + if args.debug_mode: + print("model arch:") + print( + "mlp top arch " + + str(ln_top.size - 1) + + " layers, with input to output dimensions:" + ) + print(ln_top) + print("# of interactions") + print(num_int) + print( + "mlp bot arch " + + str(ln_bot.size - 1) + + " layers, with input to output dimensions:" + ) + print(ln_bot) + print("# of features (sparse and dense)") + print(num_fea) + print("dense feature size") + print(m_den) + print("sparse feature size") + print(m_spa) + print( + "# of embeddings (= # of sparse features) " + + str(ln_emb.size) + + ", with dimensions " + + str(m_spa) + + "x:" + ) + print(ln_emb) + + print("data (inputs and targets):") + for j, inputBatch in enumerate(train_ld): + X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch) + + torch.set_printoptions(precision=4) + # early exit if nbatches was set by the user and has been exceeded + if nbatches > 0 and j >= nbatches: + break + print("mini-batch: %d" % j) + print(X.detach().cpu()) + # transform offsets to lengths when printing + print( + torch.IntTensor( + [ + np.diff( + S_o.detach().cpu().tolist() + list(lS_i[i].shape) + ).tolist() + for i, S_o in enumerate(lS_o) + ] + ) + ) + print([S_i.detach().cpu() for S_i in lS_i]) + print(T.detach().cpu()) + + global ndevices + ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1 + + ### construct the neural network specified above ### + # WARNING: to obtain exactly the same initialization for + # the weights we need to start from the same random seed. + # np.random.seed(args.numpy_rand_seed) + global dlrm + dlrm = DLRM_Net( + m_spa, + ln_emb, + ln_bot, + ln_top, + arch_interaction_op=args.arch_interaction_op, + arch_interaction_itself=args.arch_interaction_itself, + sigmoid_bot=-1, + sigmoid_top=ln_top.size - 2, + sync_dense_params=args.sync_dense_params, + loss_threshold=args.loss_threshold, + ndevices=ndevices, + qr_flag=args.qr_flag, + qr_operation=args.qr_operation, + qr_collisions=args.qr_collisions, + qr_threshold=args.qr_threshold, + md_flag=args.md_flag, + md_threshold=args.md_threshold, + weighted_pooling=args.weighted_pooling, + loss_function=args.loss_function, + learning_rate=args.learning_rate, + use_gpu=use_gpu, + use_fbgemm_gpu=use_fbgemm_gpu, + fbgemm_gpu_codegen_pref=args.fbgemm_gpu_codegen_pref, + inference_only=args.inference_only, + quantize_mlp_with_bit=args.quantize_mlp_with_bit, + quantize_emb_with_bit=args.quantize_emb_with_bit, + ) + + # test prints + if args.debug_mode: + print("initial parameters (weights and bias):") + dlrm.print_weights() + + # In dlrm.quantize_embedding called below, the torch quantize calls run + # on cpu tensors only. They cannot quantize tensors stored on the gpu. + # So quantization occurs on cpu tensors before transferring them to gpu if + # use_gpu is enabled. + if args.quantize_emb_with_bit != 32: + dlrm.quantize_embedding(args.quantize_emb_with_bit) + + if not args.inference_only: + assert args.quantize_mlp_with_bit == 32, ( + "Dynamic quantization for mlp requires " + + "--inference-only because training is not supported" + ) + else: + # Currently only INT8 and FP16 quantized types are supported for quantized MLP inference. 
+ # By default we don't do the quantization: quantize_{mlp,emb}_with_bit == 32 (FP32) + assert args.quantize_mlp_with_bit in [ + 8, + 16, + 32, + ], "only support 8/16/32-bit but got {}".format(args.quantize_mlp_with_bit) + + if args.quantize_mlp_with_bit != 32: + assert not use_gpu, ( + "Cannot run dynamic quantization for mlp " + + "with --use-gpu enabled, because DynamicQuantizedLinear's " + + "forward call calls 'quantized::linear_dynamic', which cannot " + + "run with arguments from the 'CUDA' backend." + ) + if args.quantize_mlp_with_bit in [8]: + quantize_dtype = torch.qint8 + else: + quantize_dtype = torch.float16 + dlrm.top_l = torch.quantization.quantize_dynamic( + dlrm.top_l, {torch.nn.Linear}, quantize_dtype + ) + dlrm.bot_l = torch.quantization.quantize_dynamic( + dlrm.bot_l, {torch.nn.Linear}, quantize_dtype + ) + + # Prep work for embedding tables and model transfer: + # Handling single-cpu and single-gpu modes + # NOTE: This also handles dist-backend modes (CLI args --dist-backend=nccl, + # --dist-backend=ccl, and --dist-backend=mpi) because in these modes each + # process runs in single-gpu mode. For example, if 8 processes are launched + # running dlrm_s_pytorch.py with --dist-backend=nccl --use-gpu, each process + # will run in single-gpu mode, resulting in 8 gpus total running distributed + # training or distributed inference if --inference-only is enabled. + if dlrm.ndevices_available <= 1: + if use_fbgemm_gpu: + dlrm.fbgemm_emb_l = nn.ModuleList( + [ + fbgemm_gpu_emb_bag_wrapper( + device, + dlrm.emb_l if dlrm.emb_l else dlrm.emb_l_q, + dlrm.m_spa, + dlrm.quantize_bits, + dlrm.learning_rate, + dlrm.fbgemm_gpu_codegen_pref, + dlrm.requires_grad, + ) + ] + ) + if use_gpu: + dlrm = dlrm.to(device) + if dlrm.weighted_pooling == "fixed": + for k, w in enumerate(dlrm.v_W_l): + dlrm.v_W_l[k] = w.cuda() + else: + # Handing Multi-gpu mode + dlrm.bot_l = dlrm.bot_l.to(device) + dlrm.top_l = dlrm.top_l.to(device) + dlrm.prepare_parallel_model(ndevices) + + if args.use_torch2trt_for_mlp: + if torch2trt and use_gpu and args.inference_only and args.quantize_mlp_with_bit == 32: + bot_l_sample_input = torch.ones([1, ln_bot[0]], dtype=torch.float32).cuda() + top_l_sample_input = torch.ones([1, ln_top[0]], dtype=torch.float32).cuda() + dlrm.bot_l = torch2trt.torch2trt(dlrm.bot_l, (bot_l_sample_input,)) + dlrm.top_l = torch2trt.torch2trt(dlrm.top_l, (top_l_sample_input,)) + elif torch2trt is None: + sys.exit("\ntorch2trt module failed to import.\n\n" + torch2trt_import_error_msg) + else: + error_msg = "ERROR: When --use-torch2trt-for-mlp is enabled, " + if not use_gpu: + error_msg += "--use-gpu must be enabled, " + if not args.inference_only: + error_msg += "--inference-only must be enabled, " + if args.quantize_mlp_with_bit != 32: + error_msg += "--quantize-mlp-with-bit must be disabled. " + error_msg = error_msg[:-2] + "." 
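# A minimal CPU-only sketch of the dynamic quantization applied to dlrm.top_l and
# dlrm.bot_l above; the toy MLP and its layer sizes are hypothetical.
import torch
import torch.nn as nn

toy_mlp = nn.Sequential(nn.Linear(16, 8), nn.ReLU(), nn.Linear(8, 1), nn.Sigmoid())
toy_mlp_int8 = torch.quantization.quantize_dynamic(toy_mlp, {nn.Linear}, dtype=torch.qint8)
toy_mlp_fp16 = torch.quantization.quantize_dynamic(toy_mlp, {nn.Linear}, dtype=torch.float16)

x = torch.randn(4, 16)
# Weights are stored quantized and activations are quantized on the fly at run time,
# which is why this path is restricted to --inference-only and to CPU tensors.
print(toy_mlp_int8(x).shape, toy_mlp_fp16(x).shape)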
+ sys.exit(error_msg) + + # distribute data parallel mlps + if ext_dist.my_size > 1: + if use_gpu: + device_ids = [ext_dist.my_local_rank] + dlrm.bot_l = ext_dist.DDP(dlrm.bot_l, device_ids=device_ids) + dlrm.top_l = ext_dist.DDP(dlrm.top_l, device_ids=device_ids) + else: + dlrm.bot_l = ext_dist.DDP(dlrm.bot_l) + dlrm.top_l = ext_dist.DDP(dlrm.top_l) + + if not args.inference_only: + # specify the optimizer algorithm + opts = { + "sgd": torch.optim.SGD, + "rwsadagrad": RowWiseSparseAdagrad.RWSAdagrad, + "adagrad": apex.optimizers.FusedAdagrad + if apex + else torch.optim.Adagrad, + } + + parameters = ( + dlrm.parameters() + if ext_dist.my_size == 1 + else [ + { + "params": [ + p + for emb in ( + [e.fbgemm_gpu_emb_bag for e in dlrm.fbgemm_emb_l] + if use_fbgemm_gpu + else dlrm.emb_l_q + if dlrm.quantize_bits != 32 + else dlrm.emb_l + ) + for p in emb.parameters() + ], + "lr": args.learning_rate, + }, + # TODO check this lr setup + # bottom mlp has no data parallelism + # need to check how do we deal with top mlp + { + "params": dlrm.bot_l.parameters(), + "lr": args.learning_rate, + }, + { + "params": dlrm.top_l.parameters(), + "lr": args.learning_rate, + }, + ] + ) + optimizer = opts[args.optimizer](parameters, lr=args.learning_rate) + lr_scheduler = LRPolicyScheduler( + optimizer, + args.lr_num_warmup_steps, + args.lr_decay_start_step, + args.lr_num_decay_steps, + ) + + # Guarantee GPU setup has completed before training or inference starts. + if use_gpu: + torch.cuda.synchronize() + + ### main loop ### + + # training or inference + best_acc_test = 0 + best_auc_test = 0 + skip_upto_epoch = 0 + skip_upto_batch = 0 + total_time = 0 + total_loss = 0 + total_iter = 0 + total_samp = 0 + + if args.mlperf_logging: + mlperf_logger.mlperf_submission_log("dlrm") + mlperf_logger.log_event( + key=mlperf_logger.constants.SEED, value=args.numpy_rand_seed + ) + mlperf_logger.log_event( + key=mlperf_logger.constants.GLOBAL_BATCH_SIZE, value=args.mini_batch_size + ) + + # Load model is specified + if not (args.load_model == ""): + print("Loading saved model {}".format(args.load_model)) + if use_gpu: + if dlrm.ndevices_available > 1: + # NOTE: when targeting inference on multiple GPUs, + # load the model as is on CPU or GPU, with the move + # to multiple GPUs to be done in parallel_forward + ld_model = torch.load(args.load_model) + else: + # NOTE: when targeting inference on single GPU, + # note that the call to .to(device) has already happened + ld_model = torch.load( + args.load_model, + map_location=torch.device("cuda") + # map_location=lambda storage, loc: storage.cuda(0) + ) + else: + # when targeting inference on CPU + ld_model = torch.load(args.load_model, map_location=torch.device("cpu")) + dlrm.load_state_dict(ld_model["state_dict"]) + ld_j = ld_model["iter"] + ld_k = ld_model["epoch"] + ld_nepochs = ld_model["nepochs"] + ld_nbatches = ld_model["nbatches"] + ld_nbatches_test = ld_model["nbatches_test"] + ld_train_loss = ld_model["train_loss"] + ld_total_loss = ld_model["total_loss"] + if args.mlperf_logging: + ld_gAUC_test = ld_model["test_auc"] + ld_acc_test = ld_model["test_acc"] + if not args.inference_only: + optimizer.load_state_dict(ld_model["opt_state_dict"]) + best_acc_test = ld_acc_test + total_loss = ld_total_loss + skip_upto_epoch = ld_k # epochs + skip_upto_batch = ld_j # batches + else: + args.print_freq = ld_nbatches + args.test_freq = 0 + + print( + "Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format( + ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test + ) + ) 
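# A minimal sketch of the optimizer dispatch used above: the optimizer class is looked up
# by name and each parameter group carries its own learning rate. The tiny two-layer
# model and the rates below are hypothetical.
import torch
import torch.nn as nn

toy_bot, toy_top = nn.Linear(8, 4), nn.Linear(4, 1)
toy_opts = {"sgd": torch.optim.SGD, "adagrad": torch.optim.Adagrad}
toy_param_groups = [
    {"params": toy_bot.parameters(), "lr": 0.05},
    {"params": toy_top.parameters(), "lr": 0.01},
]
toy_optimizer = toy_opts["sgd"](toy_param_groups, lr=0.1)  # per-group "lr" overrides 0.1
print([g["lr"] for g in toy_optimizer.param_groups])       # [0.05, 0.01]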
+ print( + "Training state: loss = {:.6f}".format( + ld_train_loss, + ) + ) + if args.mlperf_logging: + print( + "Testing state: accuracy = {:3.3f} %, auc = {:.3f}".format( + ld_acc_test * 100, ld_gAUC_test + ) + ) + else: + print("Testing state: accuracy = {:3.3f} %".format(ld_acc_test * 100)) + + print("time/loss/accuracy (if enabled):") + + if args.mlperf_logging: + # LR is logged twice for now because of a compliance checker bug + mlperf_logger.log_event( + key=mlperf_logger.constants.OPT_BASE_LR, value=args.learning_rate + ) + mlperf_logger.log_event( + key=mlperf_logger.constants.OPT_LR_WARMUP_STEPS, + value=args.lr_num_warmup_steps, + ) + + # use logging keys from the official HP table and not from the logging library + mlperf_logger.log_event( + key="sgd_opt_base_learning_rate", value=args.learning_rate + ) + mlperf_logger.log_event( + key="lr_decay_start_steps", value=args.lr_decay_start_step + ) + mlperf_logger.log_event( + key="sgd_opt_learning_rate_decay_steps", value=args.lr_num_decay_steps + ) + mlperf_logger.log_event(key="sgd_opt_learning_rate_decay_poly_power", value=2) + + tb_file = "./" + args.tensor_board_filename + writer = SummaryWriter(tb_file) + + # Pre-cache samples. + if args.precache_ml_data: + for _ in (test_ld if args.inference_only else train_ld): + pass + + ext_dist.barrier() + with torch.autograd.profiler.profile( + args.enable_profiling, use_cuda=use_gpu, record_shapes=True + ) as prof: + + if not args.inference_only: + + if args.fb5logger is not None: + fb5logger = FB5Logger(args.fb5logger) + fb5logger.header("DLRM", "OOTB", "train", args.fb5config, score_metric=loggerconstants.EXPS) + + k = 0 + while k < args.nepochs: + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_start( + key=mlperf_logger.constants.BLOCK_START, + metadata={ + mlperf_logger.constants.FIRST_EPOCH_NUM: (k + 1), + mlperf_logger.constants.EPOCH_COUNT: 1, + }, + ) + mlperf_logger.barrier() + mlperf_logger.log_start( + key=mlperf_logger.constants.EPOCH_START, + metadata={mlperf_logger.constants.EPOCH_NUM: (k + 1)}, + ) + + if k < skip_upto_epoch: + continue + + if args.print_accumulated_time: + accum_time_begin = time_wrap(use_gpu) + + if args.mlperf_logging: + previous_iteration_time = None + + for j, inputBatch in enumerate(train_ld): + if j == 0 and args.save_onnx: + X_onnx, lS_o_onnx, lS_i_onnx, _, _, _ = unpack_batch(inputBatch) + + if j < skip_upto_batch: + continue + + if k == 0 and j == args.warmup_steps and args.fb5logger is not None: + fb5logger.run_start() + + X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch) + + if args.mlperf_logging: + current_time = time_wrap(use_gpu) + if previous_iteration_time: + iteration_time = current_time - previous_iteration_time + else: + iteration_time = 0 + previous_iteration_time = current_time + else: + t1 = time_wrap(use_gpu) + + # early exit if nbatches was set by the user and has been exceeded + if nbatches > 0 and j >= nbatches: + break + + # Skip the batch if batch size not multiple of total ranks + if ext_dist.my_size > 1 and X.size(0) % ext_dist.my_size != 0: + print( + "Warning: Skiping the batch %d with size %d" + % (j, X.size(0)) + ) + continue + + mbs = T.shape[0] # = args.mini_batch_size except maybe for last + + # forward pass + Z = dlrm_wrap( + X, + lS_o, + lS_i, + use_gpu, + device, + ndevices=ndevices, + ) + + if ext_dist.my_size > 1: + T = T[ext_dist.get_my_slice(mbs)] + W = W[ext_dist.get_my_slice(mbs)] + + # loss + E = loss_fn_wrap(Z, T, use_gpu, device) + + # compute loss and accuracy + L = 
E.detach().cpu().numpy() # numpy array + # training accuracy is not disabled + # S = Z.detach().cpu().numpy() # numpy array + # T = T.detach().cpu().numpy() # numpy array + + # # print("res: ", S) + + # # print("j, train: BCE", j, L) + + # mbs = T.shape[0] # = args.mini_batch_size except maybe for last + # A = np.sum((np.round(S, 0) == T).astype(np.uint8)) + + with record_function("DLRM backward"): + # Update optimizer parameters to train weights instantiated lazily in + # the parallel_forward call. + if dlrm.ndevices_available > 1 and dlrm.add_new_weights_to_params: + + # Pop any prior extra parameters. Priors may exist because + # self.parallel_model_is_not_prepared is set back to True + # when self.parallel_model_batch_size != batch_size. + # Search "self.parallel_model_batch_size != batch_size" in code. + if "lazy_params" in optimizer.param_groups[-1].keys(): + optimizer.param_groups.pop() + + # dlrm.v_W_l_l is a list of nn.ParameterLists, one ParameterList per gpu. + # Flatten the list of nn.ParameterList to one nn.ParameterList, + # and add it to the trainable params list. + lazy_params = nn.ParameterList() + if dlrm.weighted_pooling == "learned": + lazy_params.extend( + nn.ParameterList( + [p for p_l in dlrm.v_W_l_l for p in p_l] + ) + ) + if dlrm.use_fbgemm_gpu: + lazy_params.extend( + nn.ParameterList( + [ + emb + for emb_ in dlrm.fbgemm_emb_l + for emb in emb_.fbgemm_gpu_emb_bag.parameters() + ] + ) + ) + lazy_params_dict = optimizer.param_groups[0] + lazy_params_dict["lazy_params"] = True + lazy_params_dict["params"] = lazy_params + optimizer.param_groups.append(lazy_params_dict) + dlrm.add_new_weights_to_params = False + # Run "[[t.device.type for t in grp['params']] for grp in optimizer.param_groups]" + # to view devices used by tensors in the param groups. 
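# A minimal sketch of registering lazily created parameters with an existing optimizer,
# which is what the "lazy_params" bookkeeping above does by editing
# optimizer.param_groups directly; add_param_group() is the standard API for the same
# idea. The tensors below are hypothetical.
import torch
import torch.nn as nn

w = nn.Parameter(torch.randn(4, 2))
toy_optimizer = torch.optim.SGD([w], lr=0.1)

late_w = nn.Parameter(torch.randn(3, 2))        # created lazily, e.g. on first forward
toy_optimizer.add_param_group({"params": [late_w], "lr": 0.1})
print(len(toy_optimizer.param_groups))          # 2: the original group plus the lazy one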
+ + # scaled error gradient propagation + # (where we do not accumulate gradients across mini-batches) + if ( + args.mlperf_logging + and (j + 1) % args.mlperf_grad_accum_iter == 0 + ) or not args.mlperf_logging: + optimizer.zero_grad() + # backward pass + E.backward() + + # optimizer + if ( + args.mlperf_logging + and (j + 1) % args.mlperf_grad_accum_iter == 0 + ) or not args.mlperf_logging: + optimizer.step() + lr_scheduler.step() + + if args.mlperf_logging: + total_time += iteration_time + else: + t2 = time_wrap(use_gpu) + total_time += t2 - t1 + + total_loss += L * mbs + total_iter += 1 + total_samp += mbs + + should_print = ((j + 1) % args.print_freq == 0) or ( + j + 1 == nbatches + ) + should_test = ( + (args.test_freq > 0) + and (args.data_generation in ["dataset", "random"]) + and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches)) + ) + + # print time, loss and accuracy + if should_print or should_test: + gT = 1000.0 * total_time / total_iter if args.print_time else -1 + total_time = 0 + + train_loss = total_loss / total_samp + total_loss = 0 + + str_run_type = ( + "inference" if args.inference_only else "training" + ) + + wall_time = "" + if args.print_wall_time: + wall_time = " ({})".format(time.strftime("%H:%M")) + + print( + "Finished {} it {}/{} of epoch {}, {:.2f} ms/it,".format( + str_run_type, j + 1, nbatches, k, gT + ) + + " loss {:.6f}".format(train_loss) + + wall_time, + flush=True, + ) + + if args.print_accumulated_time and ext_dist.my_rank < 2: + current_unix_time = time_wrap(use_gpu) + ext_dist.orig_print( + "Accumulated time so far: {} for process {} for step {} at {}".format( + current_unix_time - accum_time_begin, + ext_dist.my_rank, + j + 1, + current_unix_time, + ) + ) + + log_iter = nbatches * k + j + 1 + writer.add_scalar("Train/Loss", train_loss, log_iter) + + total_iter = 0 + total_samp = 0 + + # testing + if should_test: + epoch_num_float = (j + 1) / len(train_ld) + k + 1 + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_start( + key=mlperf_logger.constants.EVAL_START, + metadata={ + mlperf_logger.constants.EPOCH_NUM: epoch_num_float + }, + ) + + # don't measure training iter time in a test iteration + if args.mlperf_logging: + previous_iteration_time = None + print( + "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k) + ) + model_metrics_dict, is_best = inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld, + device, + use_gpu, + log_iter, + ) + + if ( + is_best + and not (args.save_model == "") + and not args.inference_only + ): + model_metrics_dict["epoch"] = k + model_metrics_dict["iter"] = j + 1 + model_metrics_dict["train_loss"] = train_loss + model_metrics_dict["total_loss"] = total_loss + model_metrics_dict[ + "opt_state_dict" + ] = optimizer.state_dict() + print("Saving model to {}".format(args.save_model)) + torch.save(model_metrics_dict, args.save_model) + + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_end( + key=mlperf_logger.constants.EVAL_STOP, + metadata={ + mlperf_logger.constants.EPOCH_NUM: epoch_num_float + }, + ) + + # Uncomment the line below to print out the total time with overhead + # print("Total test time for this group: {}" \ + # .format(time_wrap(use_gpu) - accum_test_time_begin)) + + if ( + args.mlperf_logging + and (args.mlperf_acc_threshold > 0) + and (best_acc_test > args.mlperf_acc_threshold) + ): + print( + "MLPerf testing accuracy threshold " + + str(args.mlperf_acc_threshold) + + " reached, stop training" + ) + break + + if ( + 
args.mlperf_logging + and (args.mlperf_auc_threshold > 0) + and (best_auc_test > args.mlperf_auc_threshold) + ): + print( + "MLPerf testing auc threshold " + + str(args.mlperf_auc_threshold) + + " reached, stop training" + ) + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_end( + key=mlperf_logger.constants.RUN_STOP, + metadata={ + mlperf_logger.constants.STATUS: mlperf_logger.constants.SUCCESS + }, + ) + break + if k == 0 and args.fb5logger is not None: + fb5logger.run_stop(nbatches - args.warmup_steps, args.mini_batch_size) + + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_end( + key=mlperf_logger.constants.EPOCH_STOP, + metadata={mlperf_logger.constants.EPOCH_NUM: (k + 1)}, + ) + mlperf_logger.barrier() + mlperf_logger.log_end( + key=mlperf_logger.constants.BLOCK_STOP, + metadata={mlperf_logger.constants.FIRST_EPOCH_NUM: (k + 1)}, + ) + k += 1 # nepochs + if args.mlperf_logging and best_auc_test <= args.mlperf_auc_threshold: + mlperf_logger.barrier() + mlperf_logger.log_end( + key=mlperf_logger.constants.RUN_STOP, + metadata={ + mlperf_logger.constants.STATUS: mlperf_logger.constants.ABORTED + }, + ) + else: + print("Testing for inference only") + inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld, + device, + use_gpu, + ) + + # profiling + if args.enable_profiling: + time_stamp = str(datetime.datetime.now()).replace(" ", "_") + with open("dlrm_s_pytorch" + time_stamp + "_shape.prof", "w") as prof_f: + prof_f.write( + prof.key_averages(group_by_input_shape=True).table( + sort_by="self_cpu_time_total" + ) + ) + with open("dlrm_s_pytorch" + time_stamp + "_total.prof", "w") as prof_f: + prof_f.write(prof.key_averages().table(sort_by="self_cpu_time_total")) + prof.export_chrome_trace("dlrm_s_pytorch" + time_stamp + ".json") + # print(prof.key_averages().table(sort_by="cpu_time_total")) + + # plot compute graph + if args.plot_compute_graph: + sys.exit( + "ERROR: Please install pytorchviz package in order to use the" + + " visualization. Then, uncomment its import above as well as" + + " three lines below and run the code again." 
+ ) + # V = Z.mean() if args.inference_only else E + # dot = make_dot(V, params=dict(dlrm.named_parameters())) + # dot.render('dlrm_s_pytorch_graph') # write .pdf file + + # test prints + if not args.inference_only and args.debug_mode: + print("updated parameters (weights and bias):") + dlrm.print_weights() + + # export the model in onnx + if args.save_onnx: + """ + # workaround 1: tensor -> list + if torch.is_tensor(lS_i_onnx): + lS_i_onnx = [lS_i_onnx[j] for j in range(len(lS_i_onnx))] + # workaound 2: list -> tensor + lS_i_onnx = torch.stack(lS_i_onnx) + """ + # debug prints + # print("inputs", X_onnx, lS_o_onnx, lS_i_onnx) + # print("output", dlrm_wrap(X_onnx, lS_o_onnx, lS_i_onnx, use_gpu, device)) + dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx" + print("X_onnx.shape", X_onnx.shape) + if torch.is_tensor(lS_o_onnx): + print("lS_o_onnx.shape", lS_o_onnx.shape) + else: + for oo in lS_o_onnx: + print("oo.shape", oo.shape) + if torch.is_tensor(lS_i_onnx): + print("lS_i_onnx.shape", lS_i_onnx.shape) + else: + for ii in lS_i_onnx: + print("ii.shape", ii.shape) + + # name inputs and outputs + o_inputs = ( + ["offsets"] + if torch.is_tensor(lS_o_onnx) + else ["offsets_" + str(i) for i in range(len(lS_o_onnx))] + ) + i_inputs = ( + ["indices"] + if torch.is_tensor(lS_i_onnx) + else ["indices_" + str(i) for i in range(len(lS_i_onnx))] + ) + all_inputs = ["dense_x"] + o_inputs + i_inputs + # debug prints + print("inputs", all_inputs) + + # create dynamic_axis dictionaries + do_inputs = ( + [{"offsets": {1: "batch_size"}}] + if torch.is_tensor(lS_o_onnx) + else [ + {"offsets_" + str(i): {0: "batch_size"}} for i in range(len(lS_o_onnx)) + ] + ) + di_inputs = ( + [{"indices": {1: "batch_size"}}] + if torch.is_tensor(lS_i_onnx) + else [ + {"indices_" + str(i): {0: "batch_size"}} for i in range(len(lS_i_onnx)) + ] + ) + dynamic_axes = {"dense_x": {0: "batch_size"}, "pred": {0: "batch_size"}} + for do in do_inputs: + dynamic_axes.update(do) + for di in di_inputs: + dynamic_axes.update(di) + # debug prints + print(dynamic_axes) + # export model + torch.onnx.export( + dlrm, + (X_onnx, lS_o_onnx, lS_i_onnx), + dlrm_pytorch_onnx_file, + verbose=True, + use_external_data_format=True, + opset_version=11, + input_names=all_inputs, + output_names=["pred"], + dynamic_axes=dynamic_axes, + ) + # recover the model back + dlrm_pytorch_onnx = onnx.load("dlrm_s_pytorch.onnx") + # check the onnx model + onnx.checker.check_model(dlrm_pytorch_onnx) + total_time_end = time_wrap(use_gpu) + + +if __name__ == "__main__": + run() diff --git a/benchmarks/dlrm/ootb/extend_distributed.py b/benchmarks/dlrm/ootb/extend_distributed.py new file mode 100644 index 0000000..1f2c8a5 --- /dev/null +++ b/benchmarks/dlrm/ootb/extend_distributed.py @@ -0,0 +1,603 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
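# A minimal sketch (world size and batch size are hypothetical) of the divmod-based
# partitioning performed by get_split_lengths() and get_my_slice() below when a batch
# does not divide evenly across ranks.
def split_lengths_example(n, world_size):
    k, m = divmod(n, world_size)
    return [(k + 1) if i < m else k for i in range(world_size)]

print(split_lengths_example(10, 4))  # [3, 3, 2, 2]: the first n % world_size ranks get one extra row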
+# +import builtins +import os +import sys + +import torch +import torch.distributed as dist +from torch.autograd import Function +from torch.autograd.profiler import record_function +from torch.nn.parallel import DistributedDataParallel as DDP + + +try: + import torch_ccl +except ImportError as e: + # print(e) + torch_ccl = False + +try: + import torch_ucc +except ImportError as e: + torch_ucc = False + + +my_rank = -1 +my_size = -1 +my_local_rank = -1 +my_local_size = -1 +alltoall_supported = False +a2a_impl = os.environ.get("DLRM_ALLTOALL_IMPL", "") + +myreq = None + + +def env2int(env_list, default=-1): + for e in env_list: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +def get_my_slice(n): + k, m = divmod(n, my_size) + return slice( + my_rank * k + min(my_rank, m), (my_rank + 1) * k + min(my_rank + 1, m), 1 + ) + + +def get_split_lengths(n): + k, m = divmod(n, my_size) + if m == 0: + splits = None + my_len = k + else: + splits = [(k + 1) if i < m else k for i in range(my_size)] + my_len = splits[my_rank] + return (my_len, splits) + + +def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend=""): + global myreq + global my_rank + global my_size + global my_local_rank + global my_local_size + global a2a_impl + global alltoall_supported + + # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2) + num_mpi_ranks = env2int( + ["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"] + ) + if backend == "" and num_mpi_ranks > 1: + if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0: + backend = "ccl" + elif use_gpu and dist.is_nccl_available(): + backend = "nccl" + elif dist.is_mpi_available(): + backend = "mpi" + else: + print( + "WARNING: MPI multi-process launch detected but PyTorch MPI backend not available." + ) + backend = "gloo" + + if backend != "": + # guess Rank and size + if rank == -1: + rank = env2int( + ["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0 + ) + if size == -1: + size = env2int( + [ + "PMI_SIZE", + "OMPI_COMM_WORLD_SIZE", + "MV2_COMM_WORLD_SIZE", + "WORLD_SIZE", + ], + 1, + ) + if not os.environ.get("RANK", None) and rank != -1: + os.environ["RANK"] = str(rank) + if not os.environ.get("WORLD_SIZE", None) and size != -1: + os.environ["WORLD_SIZE"] = str(size) + if not os.environ.get("MASTER_PORT", None): + os.environ["MASTER_PORT"] = "29500" + if not os.environ.get("MASTER_ADDR", None): + local_size = env2int( + [ + "MPI_LOCALNRANKS", + "OMPI_COMM_WORLD_LOCAL_SIZE", + "MV2_COMM_WORLD_LOCAL_SIZE", + ], + 1, + ) + if local_size != size and backend != "mpi": + print( + "Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default" + ) + print( + "If this run hangs, try exporting rank 0's hostname as MASTER_ADDR" + ) + os.environ["MASTER_ADDR"] = "127.0.0.1" + + if size > 1: + if local_rank == -1: + my_local_rank = env2int( + [ + "MPI_LOCALRANKID", + "OMPI_COMM_WORLD_LOCAL_RANK", + "MV2_COMM_WORLD_LOCAL_RANK", + "LOCAL_RANK", + ], + 0, + ) + else: + my_local_rank = local_rank + my_local_size = env2int( + [ + "MPI_LOCALNRANKS", + "OMPI_COMM_WORLD_LOCAL_SIZE", + "MV2_COMM_WORLD_LOCAL_SIZE", + ], + 1, + ) + if use_gpu: + if my_local_size > torch.cuda.device_count(): + print( + "Not sufficient GPUs available... 
local_size = %d, ngpus = %d" + % (my_local_size, torch.cuda.device_count()) + ) + sys.exit(1) + torch.cuda.set_device(my_local_rank) + dist.init_process_group(backend, rank=rank, world_size=size) + my_rank = dist.get_rank() + my_size = dist.get_world_size() + if my_rank == 0: + print("Running on %d ranks using %s backend" % (my_size, backend)) + if hasattr(dist, "all_to_all_single"): + try: + t = torch.zeros([4]) + if use_gpu: + t = t.cuda() + dist.all_to_all_single(t, t) + alltoall_supported = True + except RuntimeError as err: + print("fail to enable all_to_all_single primitive: %s" % err) + if a2a_impl == "alltoall" and alltoall_supported == False: + print( + "Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall" + % (a2a_impl, backend) + ) + a2a_impl = "scatter" + if a2a_impl != "": + print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl) + else: + my_rank = 0 + my_size = 1 + my_local_rank = 0 + my_local_size = 1 + print_all( + "world size: %d, current rank: %d, local rank: %d" + % (my_size, my_rank, my_local_rank) + ) + myreq = Request() + + +class Request(object): + def __init__(self): + self.req = None + self.tensor = None + self.WaitFunction = All2All_Scatter_Wait + + def wait(self): + ret = self.WaitFunction.apply(*self.tensor) + self.req = None + self.tensor = None + return ret + + +class All2All_ScatterList_Req(Function): + @staticmethod + def forward(ctx, a2a_info, *inputs): + global myreq + batch_split_lengths = ( + a2a_info.global_batch_partition_slices + if a2a_info.global_batch_partition_slices + else a2a_info.local_batch_num + ) + table_split_lengths = ( + a2a_info.global_table_wise_parition_slices + if a2a_info.global_table_wise_parition_slices + else [a2a_info.local_table_num] * my_size + ) + gather_list = [] + req_list = [] + for i in range(my_size): + for j in range(table_split_lengths[i]): + out_tensor = inputs[0].new_empty( + [a2a_info.local_batch_num, a2a_info.emb_dim] + ) + scatter_list = ( + list(inputs[j].split(batch_split_lengths, dim=0)) + if i == my_rank + else [] + ) + req = dist.scatter(out_tensor, scatter_list, src=i, async_op=True) + gather_list.append(out_tensor) + req_list.append(req) + myreq.req = req_list + myreq.tensor = tuple(gather_list) + myreq.a2a_info = a2a_info + return myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + for r in myreq.req: + r.wait() + myreq.req = None + grad_inputs = myreq.tensor + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_ScatterList_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + ctx.a2a_info = myreq.a2a_info + for r in myreq.req: + r.wait() + myreq.req = None + myreq.tensor = None + return output + + @staticmethod + def backward(ctx, *grad_output): + global myreq + a2a_info = ctx.a2a_info + grad_output = [t.contiguous() for t in grad_output] + batch_split_lengths = ( + a2a_info.global_batch_partition_slices + if a2a_info.global_batch_partition_slices + else [a2a_info.local_batch_num] * my_size + ) + per_rank_table_splits = ( + a2a_info.global_table_wise_parition_slices + if a2a_info.global_table_wise_parition_slices + else [a2a_info.local_table_num] * my_size + ) + grad_inputs = [ + grad_output[0].new_empty([ctx.a2a_info.batch_size, ctx.a2a_info.emb_dim]) + for _ in range(a2a_info.local_table_num) + ] + req_list = [] + ind = 0 + for i in range(my_size): + for j in range(per_rank_table_splits[i]): + gather_list = ( + list(grad_inputs[j].split(batch_split_lengths, dim=0)) + if i == my_rank + else 
None + ) + req = dist.gather(grad_output[ind], gather_list, dst=i, async_op=True) + req_list.append(req) + ind += 1 + myreq.req = req_list + myreq.tensor = grad_inputs + return tuple(grad_output) + + +class All2All_Scatter_Req(Function): + @staticmethod + def forward(ctx, a2a_info, *inputs): + global myreq + batch_split_lengths = ( + a2a_info.global_batch_partition_slices + if a2a_info.global_batch_partition_slices + else a2a_info.local_batch_num + ) + table_split_lengths = ( + a2a_info.global_table_wise_parition_slices + if a2a_info.global_table_wise_parition_slices + else [a2a_info.local_table_num] * my_size + ) + input = torch.cat(inputs, dim=1) + scatter_list = list(input.split(batch_split_lengths, dim=0)) + gather_list = [] + req_list = [] + for i in range(my_size): + out_tensor = input.new_empty( + [a2a_info.local_batch_num, table_split_lengths[i] * a2a_info.emb_dim] + ) + req = dist.scatter( + out_tensor, scatter_list if i == my_rank else [], src=i, async_op=True + ) + gather_list.append(out_tensor) + req_list.append(req) + myreq.req = req_list + myreq.tensor = tuple(gather_list) + myreq.a2a_info = a2a_info + ctx.a2a_info = a2a_info + return myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + for r in myreq.req: + r.wait() + myreq.req = None + grad_input = myreq.tensor + grad_inputs = grad_input.split(ctx.a2a_info.emb_dim, dim=1) + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_Scatter_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + ctx.a2a_info = myreq.a2a_info + for r in myreq.req: + r.wait() + myreq.req = None + myreq.tensor = None + return output + + @staticmethod + def backward(ctx, *grad_output): + global myreq + assert len(grad_output) == my_size + scatter_list = [t.contiguous() for t in grad_output] + a2a_info = ctx.a2a_info + batch_split_lengths = ( + a2a_info.global_batch_partition_slices + if a2a_info.global_batch_partition_slices + else a2a_info.local_batch_num + ) + table_split_lengths = ( + a2a_info.global_table_wise_parition_slices + if a2a_info.global_table_wise_parition_slices + else [a2a_info.local_table_num] * my_size + ) + grad_input = grad_output[0].new_empty( + [a2a_info.batch_size, a2a_info.emb_dim * a2a_info.local_table_num] + ) + gather_list = list(grad_input.split(batch_split_lengths, dim=0)) + req_list = [] + for i in range(my_size): + req = dist.gather( + scatter_list[i], + gather_list if i == my_rank else [], + dst=i, + async_op=True, + ) + req_list.append(req) + myreq.req = req_list + myreq.tensor = grad_input + return grad_output + + +class All2All_Req(Function): + @staticmethod + def forward(ctx, a2a_info, *inputs): + global myreq + with record_function("DLRM alltoall_req_fwd_single"): + batch_split_lengths = a2a_info.global_batch_partition_slices + if batch_split_lengths: + batch_split_lengths = [ + m * a2a_info.emb_dim * a2a_info.local_table_num + for m in batch_split_lengths + ] + table_split_lengths = a2a_info.global_table_wise_parition_slices + if table_split_lengths: + table_split_lengths = [ + a2a_info.local_batch_num * e * a2a_info.emb_dim + for e in table_split_lengths + ] + input = torch.cat(inputs, dim=1).view([-1]) + output = input.new_empty( + [ + a2a_info.global_table_num + * a2a_info.local_batch_num + * a2a_info.emb_dim + ] + ) + req = dist.all_to_all_single( + output, input, table_split_lengths, batch_split_lengths, async_op=True + ) + + myreq.req = req + myreq.tensor = [] + myreq.tensor.append(output) + myreq.tensor = tuple(myreq.tensor) + 
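# A minimal sketch of the cat/split round trip these autograd Functions rely on:
# per-table embedding outputs of shape [local_batch_num, emb_dim] are concatenated along
# dim=1 before communication and split back into per-table chunks afterwards. The sizes
# below are hypothetical.
import torch

emb_dim, local_batch_num, local_table_num = 4, 3, 2
tables = [torch.randn(local_batch_num, emb_dim) for _ in range(local_table_num)]

packed = torch.cat(tables, dim=1)        # shape [3, 8]
unpacked = packed.split(emb_dim, dim=1)  # back to two [3, 4] chunks
assert all(torch.equal(a, b) for a, b in zip(tables, unpacked))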
a2a_info.batch_split_lengths = batch_split_lengths + a2a_info.table_split_lengths = table_split_lengths + myreq.a2a_info = a2a_info + ctx.a2a_info = a2a_info + return myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + with record_function("DLRM alltoall_req_bwd_single"): + a2a_info = ctx.a2a_info + myreq.req.wait() + myreq.req = None + grad_input = myreq.tensor + grad_inputs = grad_input.view([a2a_info.batch_size, -1]).split( + a2a_info.emb_dim, dim=1 + ) + grad_inputs = [gin.contiguous() for gin in grad_inputs] + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + with record_function("DLRM alltoall_wait_fwd_single"): + a2a_info = myreq.a2a_info + ctx.a2a_info = a2a_info + myreq.req.wait() + myreq.req = None + myreq.tensor = None + table_split_lengths = ( + a2a_info.table_split_lengths + if a2a_info.table_split_lengths + else a2a_info.local_table_num + * a2a_info.local_batch_num + * a2a_info.emb_dim + ) + outputs = output[0].split(table_split_lengths) + outputs = tuple( + [out.view([a2a_info.local_batch_num, -1]) for out in outputs] + ) + return outputs + + @staticmethod + def backward(ctx, *grad_outputs): + global myreq + with record_function("DLRM alltoall_wait_bwd_single"): + a2a_info = ctx.a2a_info + grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs] + grad_output = torch.cat(grad_outputs) + grad_input = grad_output.new_empty( + [a2a_info.batch_size * a2a_info.local_table_num * a2a_info.emb_dim] + ) + req = dist.all_to_all_single( + grad_input, + grad_output, + a2a_info.batch_split_lengths, + a2a_info.table_split_lengths, + async_op=True, + ) + myreq.req = req + myreq.tensor = grad_input + return (grad_output,) + + +class AllGather(Function): + @staticmethod + def forward(ctx, input, global_lengths, dim=0): + if not isinstance(global_lengths, (list, tuple)): + global_lengths = [global_lengths] * my_size + + assert len(global_lengths) == my_size + assert global_lengths[my_rank] == input.size(dim) + local_start = sum(global_lengths[:my_rank]) + + output_size = list(input.size()) + + ctx.dim = dim + ctx.local_start = local_start + ctx.local_length = global_lengths[my_rank] + + input = input.contiguous() + if dim == 0: + out_len = sum(global_lengths) + output_size[dim] = out_len + output = input.new_empty(output_size) + gather_list = list(output.split(global_lengths, dim=0)) + else: + gather_list = [torch.empty_like(input) for _ in range(my_size)] + gather_list = [] + for length in global_lengths: + output_size[dim] = length + gather_list.append(input.new_empty(output_size)) + + dist.all_gather(gather_list, input) + + if dim != 0: + output = torch.cat(gather_list, dim=dim) + + return output + + @staticmethod + def backward(ctx, grad_output): + # print("Inside All2AllBackward") + dim = ctx.dim + start = ctx.local_start + length = ctx.local_length + + grad_input = grad_output.narrow(dim, start, length) + + return (grad_input, None, None) + + +class All2AllInfo(object): + pass + + +def alltoall(inputs, per_rank_table_splits): + global myreq + batch_size, emb_dim = inputs[0].size() + a2a_info = All2AllInfo() + a2a_info.local_table_num = len(inputs) + a2a_info.global_table_wise_parition_slices = per_rank_table_splits + ( + a2a_info.local_batch_num, + a2a_info.global_batch_partition_slices, + ) = get_split_lengths(batch_size) + a2a_info.emb_dim = emb_dim + a2a_info.batch_size = batch_size + a2a_info.global_table_num = ( + sum(per_rank_table_splits) + 
if per_rank_table_splits + else a2a_info.local_table_num * my_size + ) + + if a2a_impl == "" and alltoall_supported or a2a_impl == "alltoall": + # print("Using All2All_Req") + output = All2All_Req.apply(a2a_info, *inputs) + myreq.WaitFunction = All2All_Wait + elif a2a_impl == "" or a2a_impl == "scatter": + # print("Using All2All_Scatter_Req") + output = All2All_Scatter_Req.apply(a2a_info, *inputs) + myreq.WaitFunction = All2All_Scatter_Wait + elif a2a_impl == "scatter_list": + # print("Using All2All_ScatterList_Req") + output = All2All_ScatterList_Req.apply(a2a_info, *inputs) + myreq.WaitFunction = All2All_ScatterList_Wait + else: + print( + "Unknown value set for DLRM_ALLTOALL_IMPL (%s), " + "please use one of [alltoall, scatter, scatter_list]" % a2a_impl + ) + return myreq + + +def all_gather(input, lengths, dim=0): + if not lengths: + lengths = [input.size(0)] * my_size + return AllGather.apply(input, lengths, dim) + + +def barrier(): + if my_size > 1: + dist.barrier() + + +# Override builtin print function to print only from rank 0 +orig_print = builtins.print + + +def rank0_print(*args, **kwargs): + if my_rank <= 0 or kwargs.get("print_all", False): + orig_print(*args, **kwargs) + + +builtins.print = rank0_print + +# Allow printing from all rank with explicit print_all +def print_all(*args, **kwargs): + orig_print(*args, **kwargs) diff --git a/benchmarks/dlrm/ootb/input/dist_emb_0.log b/benchmarks/dlrm/ootb/input/dist_emb_0.log new file mode 100644 index 0000000..7a8c1b7 --- /dev/null +++ b/benchmarks/dlrm/ootb/input/dist_emb_0.log @@ -0,0 +1,3 @@ +1, 2, 3, 4, 5, 6 +0, 1, 3, 4, 5 +0.55, 0.64, 0.82, 0.91, 1.0 diff --git a/benchmarks/dlrm/ootb/input/dist_emb_1.log b/benchmarks/dlrm/ootb/input/dist_emb_1.log new file mode 100644 index 0000000..7a8c1b7 --- /dev/null +++ b/benchmarks/dlrm/ootb/input/dist_emb_1.log @@ -0,0 +1,3 @@ +1, 2, 3, 4, 5, 6 +0, 1, 3, 4, 5 +0.55, 0.64, 0.82, 0.91, 1.0 diff --git a/benchmarks/dlrm/ootb/input/dist_emb_2.log b/benchmarks/dlrm/ootb/input/dist_emb_2.log new file mode 100644 index 0000000..7a8c1b7 --- /dev/null +++ b/benchmarks/dlrm/ootb/input/dist_emb_2.log @@ -0,0 +1,3 @@ +1, 2, 3, 4, 5, 6 +0, 1, 3, 4, 5 +0.55, 0.64, 0.82, 0.91, 1.0 diff --git a/benchmarks/dlrm/ootb/input/trace.log b/benchmarks/dlrm/ootb/input/trace.log new file mode 100644 index 0000000..4d33e55 --- /dev/null +++ b/benchmarks/dlrm/ootb/input/trace.log @@ -0,0 +1 @@ +1, 2, 3, 4, 5, 3, 4, 1, 1, 6, 3 diff --git a/benchmarks/dlrm/ootb/kaggle_dac_loss_accuracy_plots.png b/benchmarks/dlrm/ootb/kaggle_dac_loss_accuracy_plots.png new file mode 100644 index 0000000..aaa51f3 Binary files /dev/null and b/benchmarks/dlrm/ootb/kaggle_dac_loss_accuracy_plots.png differ diff --git a/benchmarks/dlrm/ootb/mlperf_logger.py b/benchmarks/dlrm/ootb/mlperf_logger.py new file mode 100644 index 0000000..efce1d3 --- /dev/null +++ b/benchmarks/dlrm/ootb/mlperf_logger.py @@ -0,0 +1,118 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
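# A minimal sketch of the rank-0-only print override installed at the bottom of
# extend_distributed.py; here kwargs.pop() strips the extra flag before it reaches the
# built-in print. my_rank below is hypothetical (normally set by init_distributed()).
import builtins

my_rank = 1
orig_print = builtins.print

def rank0_print(*args, **kwargs):
    if my_rank <= 0 or kwargs.pop("print_all", False):
        orig_print(*args, **kwargs)

builtins.print = rank0_print
print("only rank 0 sees this")                 # suppressed here because my_rank == 1
print("every rank sees this", print_all=True)  # printed on all ranks
builtins.print = orig_print                    # restore for the rest of this sketch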
+ + +""" +Utilities for MLPerf logging +""" +import os +import torch + +try: + from mlperf_logging import mllog + from mlperf_logging.mllog import constants + _MLLOGGER = mllog.get_mllogger() +except ImportError as error: + print("Unable to import mlperf_logging, ", error) + + +def log_start(*args, **kwargs): + "log with start tag" + _log_print(_MLLOGGER.start, *args, **kwargs) + + +def log_end(*args, **kwargs): + "log with end tag" + _log_print(_MLLOGGER.end, *args, **kwargs) + + +def log_event(*args, **kwargs): + "log with event tag" + _log_print(_MLLOGGER.event, *args, **kwargs) + + +def _log_print(logger, *args, **kwargs): + "makes mlperf logger aware of distributed execution" + if 'stack_offset' not in kwargs: + kwargs['stack_offset'] = 3 + if 'value' not in kwargs: + kwargs['value'] = None + + if kwargs.pop('log_all_ranks', False): + log = True + else: + log = (get_rank() == 0) + + if log: + logger(*args, **kwargs) + + +def config_logger(benchmark): + "initiates mlperf logger" + mllog.config(filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log')) + _MLLOGGER.logger.propagate = False + + +def barrier(): + """ + Works as a temporary distributed barrier, currently pytorch + doesn't implement barrier for NCCL backend. + Calls all_reduce on dummy tensor and synchronizes with GPU. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) + torch.cuda.synchronize() + + +def get_rank(): + """ + Gets distributed rank or returns zero if distributed is not initialized. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + else: + rank = 0 + return rank + + +def mlperf_submission_log(benchmark): + """ + Logs information needed for MLPerf submission + """ + + config_logger(benchmark) + + log_event( + key=constants.SUBMISSION_BENCHMARK, + value=benchmark, + ) + + log_event( + key=constants.SUBMISSION_ORG, + value='reference_implementation') + + log_event( + key=constants.SUBMISSION_DIVISION, + value='closed') + + log_event( + key=constants.SUBMISSION_STATUS, + value='onprem') + + log_event( + key=constants.SUBMISSION_PLATFORM, + value='reference_implementation') + + log_event( + key=constants.SUBMISSION_ENTRY, + value="reference_implementation") + + log_event( + key=constants.SUBMISSION_POC_NAME, + value='reference_implementation') + + log_event( + key=constants.SUBMISSION_POC_EMAIL, + value='reference_implementation') diff --git a/benchmarks/dlrm/ootb/optim/rwsadagrad.py b/benchmarks/dlrm/ootb/optim/rwsadagrad.py new file mode 100644 index 0000000..95381ec --- /dev/null +++ b/benchmarks/dlrm/ootb/optim/rwsadagrad.py @@ -0,0 +1,122 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import torch +from torch.optim import Optimizer + + +class RWSAdagrad(Optimizer): + """Implements Row Wise Sparse Adagrad algorithm. 
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-2) + lr_decay (float, optional): learning rate decay (default: 0) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-10) + + """ + + def __init__(self, params, lr=1e-2, lr_decay=0.0, weight_decay=0.0, initial_accumulator_value=0.0, eps=1e-10): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= lr_decay: + raise ValueError("Invalid lr_decay value: {}".format(lr_decay)) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + if not 0.0 <= initial_accumulator_value: + raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + + self.defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay, + initial_accumulator_value=initial_accumulator_value) + super(RWSAdagrad, self).__init__(params, self.defaults) + + self.momentum_initialized = False + + for group in self.param_groups: + for p in group['params']: + self.state[p]['step'] = 0 + + def share_memory(self): + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if p.grad.data.is_sparse: + state['momentum'].share_memory_() + else: + state['sum'].share_memory_() + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + if not self.momentum_initialized : + if p.grad.data.is_sparse: + self.state[p]['momentum'] = torch.full( + [p.data.shape[0]], + self.defaults["initial_accumulator_value"], + dtype=torch.float32, + ) + else: + self.state[p]['sum'] = torch.full_like(p.data, + self.defaults["initial_accumulator_value"], + dtype=torch.float32, + ) + + grad = p.grad + state = self.state[p] + + state['step'] += 1 + + if group['weight_decay'] != 0: + if p.grad.data.is_sparse: + raise RuntimeError("weight_decay option is not compatible with sparse gradients") + grad = grad.add(group['weight_decay'], p.data) + + clr = group['lr'] / (1.0 + (state['step'] - 1.0) * group['lr_decay']) + + if grad.is_sparse: + grad = grad.coalesce() # the update is non-linear so indices must be unique + grad_indices = grad._indices() + grad_values = grad._values() + size = grad.size() + + def make_sparse(values, row_wise): + constructor = grad.new + matrix_size = [size[0]] if row_wise else size + return constructor(grad_indices, values, matrix_size) + + if grad_values.numel() > 0: + momentum_update = make_sparse(grad_values.pow(2).mean(dim=1), True) + state['momentum'].add_(momentum_update) # update momentum + std = state['momentum'].sparse_mask(momentum_update.coalesce()) + std_values = std._values().sqrt_().add_(group['eps']) + p.data.add_(make_sparse(grad_values / std_values.view(std_values.size()[0], 1), False), alpha=-clr) + + else: + state['sum'].addcmul_(grad, grad, value=1.0) + std = state['sum'].sqrt().add_(group['eps']) + p.data.addcdiv_(grad, std, value=-clr) + + self.momentum_initialized = True + + return loss diff --git 
a/benchmarks/dlrm/ootb/requirements.txt b/benchmarks/dlrm/ootb/requirements.txt new file mode 100644 index 0000000..b198a12 --- /dev/null +++ b/benchmarks/dlrm/ootb/requirements.txt @@ -0,0 +1,8 @@ +future +numpy +onnx +pydot +torch +torchviz +scikit-learn +tqdm diff --git a/benchmarks/dlrm/ootb/test/dlrm_s_test.sh b/benchmarks/dlrm/ootb/test/dlrm_s_test.sh new file mode 100755 index 0000000..e504545 --- /dev/null +++ b/benchmarks/dlrm/ootb/test/dlrm_s_test.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +#WARNING: must have compiled PyTorch and caffe2 + +#check if extra argument is passed to the test +if [[ $# == 1 ]]; then + dlrm_extra_option=$1 +else + dlrm_extra_option="" +fi +#echo $dlrm_extra_option + +dlrm_py="python dlrm_s_pytorch.py" +dlrm_c2="python dlrm_s_caffe2.py" + +echo "Running commands ..." +#run pytorch +echo $dlrm_py +$dlrm_py --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp1 +$dlrm_py --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp2 +$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp3 +$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp4 + +#run caffe2 +echo $dlrm_c2 +$dlrm_c2 --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc1 +$dlrm_c2 --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc2 +$dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc3 +$dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc4 + +echo "Checking results ..." +#check results +#WARNING: correct test will have no difference in numeric values +#(but might have some verbal difference, e.g. due to warnnings) +#in the output file +echo "diff test1 (no numeric values in the output = SUCCESS)" +diff ccc1 ppp1 +echo "diff test2 (no numeric values in the output = SUCCESS)" +diff ccc2 ppp2 +echo "diff test3 (no numeric values in the output = SUCCESS)" +diff ccc3 ppp3 +echo "diff test4 (no numeric values in the output = SUCCESS)" +diff ccc4 ppp4 diff --git a/benchmarks/dlrm/ootb/test/dlrm_s_test_fbgemm_gpu.sh b/benchmarks/dlrm/ootb/test/dlrm_s_test_fbgemm_gpu.sh new file mode 100644 index 0000000..c699043 --- /dev/null +++ b/benchmarks/dlrm/ootb/test/dlrm_s_test_fbgemm_gpu.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +#WARNING: must have fbgemm_gpu module to run these tests. 
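#
# The consistency checks below run the same DLRM configuration twice -- once with the
# stock PyTorch embedding ops (ground truth) and once with --use-fbgemm-gpu -- on CPU and
# on GPU, with and without weighted pooling, plus an 8-bit quantized inference-only pass,
# and then diff the debug-mode outputs. An empty diff means the fbgemm_gpu path
# reproduces the reference numerics.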
+ +echo -e "\nConsistency test: fbgemm_gpu -compared-with- PyTorch emb ops" +dlrm_base_config_="python dlrm_s_pytorch.py --arch-sparse-feature-size=172 --arch-mlp-bot=1559-2500-2500-172 --arch-mlp-top=2000-2000-2000-1 --arch-embedding-size=213728-213728-213728-213728-213728-213728-213728-213728 --mini-batch-size=64 --num-indices-per-lookup-fixed=1 --num-indices-per-lookup=16 --num-batches=1 --nepochs=3 --debug-mode" + +for weighted_pooling in '' ' --weighted-pooling=fixed' ' --weighted-pooling=learned'; +do + dlrm_base_config=$dlrm_base_config_$weighted_pooling + + echo -e "\n======================================================" + echo "Testing 32-bit embeddings" + + dlrm_config="$dlrm_base_config" + echo "---GROUND TRUTH--- using PyTorch emb ops on CPU" + echo "$dlrm_config" + $dlrm_config > aaa1 + echo "---COMPARISON--- using fbgemm_gpu on CPU" + echo "$dlrm_config --use-fbgemm-gpu" + $dlrm_config --use-fbgemm-gpu > aaa2 + echo "diff GT & COMP (no numeric values in the output = SUCCESS)" + diff aaa1 aaa2 + + echo "---GROUND TRUTH--- using PyTorch emb ops on GPU" + echo "$dlrm_config --use-gpu" + $dlrm_config --use-gpu > bbb1 + echo "---COMPARISON--- using fbgemm_gpu on GPU" + echo "$dlrm_config --use-gpu --use-fbgemm-gpu" + $dlrm_config --use-fbgemm-gpu --use-gpu > bbb2 + echo "diff GT & COMP (no numeric values in the output = SUCCESS)" + diff bbb1 bbb2 + + echo -e "\n======================================================" + echo "Testing 8-bit quantized embeddings, inference only" + dlrm_config="$dlrm_base_config --inference-only --quantize-emb-with-bit=8" + + echo "---GROUND TRUTH--- using PyTorch emb ops on CPU" + echo "$dlrm_config" + $dlrm_config > ccc1 + + echo "---COMPARISON--- using fbgemm_gpu on CPU" + echo "$dlrm_config --use-fbgemm-gpu" + $dlrm_config --use-fbgemm-gpu > ccc2 + echo "diff GT & COMP (no numeric values in the output = SUCCESS)" + diff ccc1 ccc2 +done diff --git a/benchmarks/dlrm/ootb/tools/visualize.py b/benchmarks/dlrm/ootb/tools/visualize.py new file mode 100755 index 0000000..f16504c --- /dev/null +++ b/benchmarks/dlrm/ootb/tools/visualize.py @@ -0,0 +1,1030 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# +# This script performs the visualization of the embedding tables created in +# DLRM during the training procedure. We use two popular techniques for +# visualization: umap (https://umap-learn.readthedocs.io/en/latest/) and +# tsne (https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html). +# These links also provide instructions on how to install these packages +# in different environments. +# +# Warning: the size of the data to be visualized depends on the RAM on your machine. 
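# A minimal sketch (random data; headless matplotlib backend assumed) of the core step
# this script performs for a single embedding table: project the table's rows to 2-D with
# UMAP and scatter-plot them. The random matrix stands in for emb_l[k].weight loaded from
# --load-model.
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import umap

E_example = np.random.randn(1000, 16).astype(np.float32)  # hypothetical 1000 x 16 table
Y_example = umap.UMAP(random_state=42, metric="euclidean").fit_transform(E_example)

plt.figure(figsize=(8, 8))
plt.scatter(Y_example[:, 0], Y_example[:, 1], s=1, marker=".")
plt.title("UMAP projection of one embedding table")
plt.savefig("cat-example-umap.png")
plt.close()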
+# +# +# Connand line examples: +# +# Full analysis of embeddings and data representations for Criteo Kaggle data: +# $python ./tools/visualize.py --data-set=kaggle --load-model=../dlrm-2020-05-25/criteo.pytorch-e-0-i-110591 +# --raw-data-file=../../criteo/input/train.txt --skip-categorical-analysis +# --processed-data-file=../../criteo/input/kaggleAdDisplayChallenge_processed.npz +# +# +# To run just the analysis of categoricala data for Criteo Kaggle data set: +# $python ./tools/visualize.py --data-set=kaggle --load-model=../dlrm-2020-05-25/criteo.pytorch-e-0-i-110591 \ +# --raw-data-file=../../criteo/input/train.txt --data-randomize=none --processed-data-file=../../criteo/input/kaggleAdDisplayChallenge_processed.npz \ +# --skip-embedding --skip-data-plots +# +# +# The following command line arguments are available to the user: +# +# --load-model - DLRM model file +# --data-set - one of ["kaggle", "terabyte"] +# --max-ind-range - max index range used during the traning +# --output-dir - output directory, if not specified, it will be traeted from the model and datset names +# --max-umap-size - max number of points to visualize using UMAP, default=50000 +# --use-tsne - use T-SNE +# --max-tsne-size - max number of points to visualize using T-SNE, default=1000) +# --skip-embedding - skips analysis of embedding tables +# --umap-metric - metric for UMAP +# --skip-data-plots - skips data plots +# --skip-categorical-analysis - skips categorical analysis +# +# # data file related +# --raw-data-file +# --processed-data-file +# --data-sub-sample-rate +# --data-randomize +# --memory-map +# --mini-batch-size +# --num-workers +# --test-mini-batch-size +# --test-num-workers +# --num-batches +# --mlperf-logging + +import os +import sys +import argparse +import numpy as np +import umap +import hdbscan +import json +import torch +import math +import matplotlib +import matplotlib.pyplot as plt +import collections + +from sklearn.metrics import accuracy_score +from sklearn.metrics import f1_score +from sklearn.metrics import precision_score +from sklearn.metrics import recall_score + +from sklearn import manifold + +import dlrm_data_pytorch as dp +from dlrm_s_pytorch import DLRM_Net + + +def visualize_embeddings_umap(emb_l, + output_dir = "", + max_size = 500000, + umap_metric = "euclidean", + cat_counts = None, + use_max_count = True): + + for k in range(0, len(emb_l)): + + E = emb_l[k].weight.detach().cpu().numpy() + print("umap", E.shape) + + # create histogram of norms + bins = 50 + norms = [np.linalg.norm(E[i], ord=2) for i in range(0,E.shape[0])] +# plt.hist(norms, bins = bins) +# plt.title("Cat norm hist var. 
"+str(k)) + hist, bins = np.histogram(norms, bins=bins) + logbins = np.logspace(np.log10(bins[0]),np.log10(bins[-1]),len(bins)) + + plt.figure(figsize=(8,8)) + plt.title("Categorical norms: " + str(k) + " cardinality " + str(len(cat_counts[k]))) + plt.hist(norms, bins=logbins) + plt.xscale("log") +# plt.legend() + plt.savefig(output_dir+"/cat-norm-histogram-"+str(k)+".png") + plt.close() + + if E.shape[0] < 20: + print("Skipping small embedding") + continue + + n_vis = min(max_size, E.shape[0]) + min_cnt = 0 + +# reducer = umap.UMAP(random_state=42, n_neighbors=25, min_dist=0.1) + reducer = umap.UMAP(random_state=42, metric=umap_metric) + + if use_max_count is False or n_vis == E.shape[0]: + Y = reducer.fit_transform(E[:n_vis,:]) + else: + + # select values with couns > 1 + done = False + min_cnt = 1 + while done == False: + el_cnt = (cat_counts[k] > min_cnt).sum() + if el_cnt <= max_size: + done = True + else: + min_cnt = min_cnt+1 + + E1= [] + for i in range(0, E.shape[0]): + if cat_counts[k][i] > min_cnt: + E1.append(E[i,:]) + + print("max_count_len", len(E1), "mincount", min_cnt) + Y = reducer.fit_transform(np.array(E1)) + + n_vis = len(E1) + + plt.figure(figsize=(8,8)) + + linewidth = 0 + size = 1 + + if Y.shape[0] < 2500: + linewidth = 1 + size = 5 + + if cat_counts is None: + plt.scatter(-Y[:,0], -Y[:,1], s=size, marker=".", linewidth=linewidth) + else: + #print(cat_counts[k]) + n_disp = min(len(cat_counts[k]), Y.shape[0]) + cur_max = math.log(max(cat_counts[k])) + norm_cat_count = [math.log(cat_counts[k][i]+1)/cur_max for i in range(0, len(cat_counts[k]))] + plt.scatter(-Y[0:n_disp,0], -Y[0:n_disp,1], s=size, marker=".", linewidth=linewidth, c=np.array(norm_cat_count)[0:n_disp], cmap="viridis") + plt.colorbar() + + plt.title("UMAP: categorical var. " + str(k) + " (" + str(n_vis) + " of " + str(E.shape[0]) + ", min count " + str(min_cnt) + ")") + plt.savefig(output_dir + "/cat-" + str(k) + "-" + str(n_vis) + "-of-" + str(E.shape[0]) + "-umap.png") + plt.close() + + +def visualize_embeddings_tsne(emb_l, + output_dir = "", + max_size = 10000): + + for k in range(0, len(emb_l)): + + E = emb_l[k].weight.detach().cpu() + print("tsne", E.shape) + + if E.shape[0] < 20: + print("Skipping small embedding") + continue + + n_vis = min(max_size, E.shape[0]) + + tsne = manifold.TSNE(init="pca", random_state=0, method="exact") + + Y = tsne.fit_transform(E[:n_vis,:]) + + plt.figure(figsize=(8, 8)) + + linewidth = 0 + if Y.shape[0] < 5000: + linewidth = 1 + + plt.scatter(-Y[:,0], -Y[:,1], s=1, marker=".", linewidth=linewidth) + + plt.title("TSNE: categorical var. 
" + str(k) + " (" + str(n_vis) + " of " + str(E.shape[0]) + ")") + plt.savefig(output_dir + "/cat-" + str(k) + "-" + str(n_vis) + "-of-" + str(E.shape[0]) + "-tsne.png") + plt.close() + + +def analyse_categorical_data(X_cat, n_days=10, output_dir=""): + + # analyse categorical variables + n_vec = len(X_cat) + n_cat = len(X_cat[0]) + n_days = n_days + + print("n_vec", n_vec, "n_cat", n_cat) +# for c in train_data.X_cat: +# print(n_cat, c) + + all_cat = np.array(X_cat) + print("all_cat.shape", all_cat.shape) + day_size = all_cat.shape[0]/n_days + + for i in range(0,n_cat): + l_d = [] + l_s1 = [] + l_s2 = [] + l_int = [] + l_rem = [] + + cat = all_cat[:,i] + print("cat", i, cat.shape) + for d in range(1,n_days): + offset = int(d*day_size) + #print(offset) + cat1 = cat[:offset] + cat2 = cat[offset:] + + s1 = set(cat1) + s2 = set(cat2) + + intersect = list(s1 & s2) + #print(intersect) + l_d.append(d) + l_s1.append(len(s1)) + l_s2.append(len(s2)) + l_int.append(len(intersect)) + l_rem.append((len(s1)-len(intersect))) + + print(d, ",", len(s1), ",", len(s2), ",", len(intersect), ",", (len(s1)-len(intersect))) + + print("spit", l_d) + print("before", l_s1) + print("after", l_s2) + print("inters.", l_int) + print("removed", l_rem) + + plt.figure(figsize=(8,8)) + plt.plot(l_d, l_s1, "g", label="before") + plt.plot(l_d, l_s2, "r", label="after") + plt.plot(l_d, l_int, "b", label="intersect") + plt.plot(l_d, l_rem, "y", label="removed") + plt.title("categorical var. "+str(i)) + plt.legend() + plt.savefig(output_dir+"/cat-"+str(i).zfill(3)+".png") + plt.close() + + +def analyse_categorical_counts(X_cat, emb_l=None, output_dir=""): + + # analyse categorical variables + n_vec = len(X_cat) + n_cat = len(X_cat[0]) + + print("n_vec", n_vec, "n_cat", n_cat) +# for c in train_data.X_cat: +# print(n_cat, c) + + all_cat = np.array(X_cat) + print("all_cat.shape", all_cat.shape) + + all_counts = [] + + for i in range(0,n_cat): + + cat = all_cat[:,i] + if emb_l is None: + s = set(cat) + counts = np.zeros((len(s))) + print("cat", i, cat.shape, len(s)) + else: + s = emb_l[i].weight.detach().cpu().shape[0] + counts = np.zeros((s)) + print("cat", i, cat.shape, s) + + for d in range(0,n_vec): + cv = int(cat[d]) + counts[cv] = counts[cv]+1 + + all_counts.append(counts) + + if emb_l is None: + plt.figure(figsize=(8,8)) + plt.plot(counts) + plt.title("Categorical var "+str(i) + " cardinality " + str(len(counts))) + # plt.legend() + else: + E = emb_l[i].weight.detach().cpu().numpy() + norms = [np.linalg.norm(E[i], ord=2) for i in range(0,E.shape[0])] + + fig, (ax0, ax1) = plt.subplots(2, 1) + fig.suptitle("Categorical variable: " + str(i)+" cardinality "+str(len(counts))) + + ax0.plot(counts) + ax0.set_yscale("log") + ax0.set_title("Counts", fontsize=10) + + ax1.plot(norms) + ax1.set_title("Norms", fontsize=10) + + plt.savefig(output_dir+"/cat_counts-"+str(i).zfill(3)+".png") + plt.close() + + return all_counts + + +def dlrm_output_wrap(dlrm, X, lS_o, lS_i, T): + + all_feat_vec = [] + all_cat_vec = [] + x_vec = None + t_out = None + c_out = None + z_out = [] + p_out = None + + z_size = len(dlrm.top_l) + + x = dlrm.apply_mlp(X, dlrm.bot_l) + # debug prints + #print("intermediate") + #print(x[0].detach().cpu().numpy()) + x_vec = x[0].detach().cpu().numpy() + all_feat_vec.append(x_vec) +# all_X.append(x[0].detach().cpu().numpy()) + + # process sparse features(using embeddings), resulting in a list of row vectors + ly = dlrm.apply_emb(lS_o, lS_i, dlrm.emb_l) + + for e in ly: + #print(e.detach().cpu().numpy()) + 
all_feat_vec.append(e[0].detach().cpu().numpy()) + all_cat_vec.append(e[0].detach().cpu().numpy()) + + all_feat_vec= np.concatenate(all_feat_vec, axis=0) + all_cat_vec= np.concatenate(all_cat_vec, axis=0) + +# all_features.append(all_feat_vec) +# all_cat.append(all_cat_vec) + t_out = int(T.detach().cpu().numpy()[0,0]) +# all_T.append(int(T.detach().cpu().numpy()[0,0])) + + z = dlrm.interact_features(x, ly) + # print(z.detach().cpu().numpy()) +# z_out = z.detach().cpu().numpy().flatten() + z_out.append(z.detach().cpu().numpy().flatten()) +# all_z[0].append(z.detach().cpu().numpy().flatten()) + + # obtain probability of a click (using top mlp) +# print(dlrm.top_l) +# p = dlrm.apply_mlp(z, dlrm.top_l) + + for i in range(0, z_size): + z = dlrm.top_l[i](z) + +# if i < z_size-1: +# curr_z = z.detach().cpu().numpy().flatten() + z_out.append(z.detach().cpu().numpy().flatten()) +# all_z[i+1].append(curr_z) +# print("z append", i) + +# print("z",i, z.detach().cpu().numpy().flatten().shape) + + p = z + + # clamp output if needed + if 0.0 < dlrm.loss_threshold and dlrm.loss_threshold < 1.0: + z = torch.clamp(p, min=dlrm.loss_threshold, max=(1.0 - dlrm.loss_threshold)) + else: + z = p + + class_thresh = 0.0 #-0.25 + zp = z.detach().cpu().numpy()[0,0]+ class_thresh + + p_out = int(zp+0.5) + if p_out > 1: + p_out = 1 + if p_out < 0: + p_out = 0 + +# all_pred.append(int(z.detach().cpu().numpy()[0,0]+0.5)) + + #print(int(z.detach().cpu().numpy()[0,0]+0.5)) + if int(p_out) == t_out: + c_out = 0 + else: + c_out = 1 + + return all_feat_vec, x_vec, all_cat_vec, t_out, c_out, z_out, p_out + + +def create_umap_data(dlrm, data_ld, max_size=50000, offset=0, info=""): + + all_features = [] + all_X = [] + all_cat = [] + all_T = [] + all_c = [] + all_z = [] + all_pred = [] + + z_size = len(dlrm.top_l) + print("z_size", z_size) + for i in range(0, z_size): + all_z.append([]) + + for j, (X, lS_o, lS_i, T) in enumerate(data_ld): + + if j < offset: + continue + + if j >= max_size+offset: + break + + af, x, cat, t, c, z, p = dlrm_output_wrap(dlrm, X, lS_o, lS_i, T) + + all_features.append(af) + all_X.append(x) + all_cat.append(cat) + all_T.append(t) + all_c.append(c) + all_pred.append(p) + + for i in range(0, z_size): + all_z[i].append(z[i]) + +# # calculate classifier metrics + ac = accuracy_score(all_T, all_pred) + f1 = f1_score(all_T, all_pred) + ps = precision_score(all_T, all_pred) + rc = recall_score(all_T, all_pred) + + print(info, "accuracy", ac, "f1", f1, "precision", ps, "recall", rc) + + return all_features, all_X, all_cat, all_T, all_z, all_c, all_pred + + +def plot_all_data_3(umap_Y, + umap_T, + train_Y = None, + train_T = None, + test_Y = None, + test_T = None, + total_train_size = "", + total_test_size = "", + info = "", + output_dir = "", + orig_space_dim = 0): + + size = 1 + colors = ["red","green"] + + fig, (ax0, ax1, ax2) = plt.subplots(1, 3) + fig.suptitle("UMAP: " + info + " space dim "+str(orig_space_dim)) + + ax0.scatter(umap_Y[:,0], umap_Y[:,1], s=size, c=umap_T, cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0) + ax0.set_title("UMAP ("+str(len(umap_T))+" of "+ total_train_size+")", fontsize=7) + + if train_Y is not None and train_T is not None: + ax1.scatter(train_Y[:,0], train_Y[:,1], s=size, c=train_T, cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0) + ax1.set_title("Train ("+str(len(train_T))+" of "+ total_train_size+")", fontsize=7) + + if test_Y is not None and test_T is not None: + ax2.scatter(test_Y[:,0], test_Y[:,1], s=size, c=test_T, 
cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0) + ax2.set_title("Test ("+str(len(test_T))+" of "+ total_test_size+")", fontsize=7) + + plt.savefig(output_dir+"/"+info+"-umap.png") + plt.close() + + +def plot_one_class_3(umap_Y, + umap_T, + train_Y, + train_T, + test_Y, + test_T, + target = 0, + col = "red", + total_train_size = "", + total_test_size = "", + info = "", + output_dir = "", + orig_space_dim = 0): + + size = 1 + + fig, (ax0, ax1, ax2) = plt.subplots(1, 3) + fig.suptitle("UMAP: "+ info + " space dim "+str(orig_space_dim)) + + ind_l_umap = [i for i,x in enumerate(umap_T) if x == target] + Y_umap_l = np.array([umap_Y[i,:] for i in ind_l_umap]) + + ax0.scatter(Y_umap_l[:,0], Y_umap_l[:,1], s=size, c=col, marker=".", linewidth=0) + ax0.set_title("UMAP, ("+str(len(umap_T))+" of "+ total_train_size+")", fontsize=7) + + if train_Y is not None and train_T is not None: + ind_l_test = [i for i,x in enumerate(train_T) if x == target] + Y_test_l = np.array([train_Y[i,:] for i in ind_l_test]) + + ax1.scatter(Y_test_l[:,0], Y_test_l[:,1], s=size, c=col, marker=".", linewidth=0) + ax1.set_title("Train, ("+str(len(train_T))+" of "+ total_train_size+")", fontsize=7) + + if test_Y is not None and test_T is not None: + ind_l_test = [i for i,x in enumerate(test_T) if x == target] + Y_test_l = np.array([test_Y[i,:] for i in ind_l_test]) + + ax2.scatter(Y_test_l[:,0], Y_test_l[:,1], s=size, c=col, marker=".", linewidth=0) + ax2.set_title("Test, ("+str(len(test_T))+" of "+ total_test_size+")", fontsize=7) + + plt.savefig(output_dir+"/"+info+"-umap.png") + plt.close() + + +def visualize_umap_data(umap_Y, + umap_T, + umap_C, + umap_P, + train_Y, + train_T, + train_C, + train_P, + test_Y = None, + test_T = None, + test_C = None, + test_P = None, + total_train_size = "", + total_test_size = "", + info = "", + output_dir = "", + orig_space_dim = 0): + + # all classes + plot_all_data_3(umap_Y = umap_Y, + umap_T = umap_T, + train_Y = train_Y, + train_T = train_T, + test_Y = test_Y, + test_T = test_T, + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info, + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # all predictions + plot_all_data_3(umap_Y = umap_Y, + umap_T = umap_P, + train_Y = train_Y, + train_T = train_P, + test_Y = test_Y, + test_T = test_P, + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info+", all-predictions", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + + # class 0 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_T, + train_Y = train_Y, + train_T = train_T, + test_Y = test_Y, + test_T = test_T, + target = 0, + col = "red", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info+" class " + str(0), + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # class 1 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_T, + train_Y = train_Y, + train_T = train_T, + test_Y = test_Y, + test_T = test_T, + target = 1, + col = "green", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " class " + str(1), + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # correct classification + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_C, + train_Y = train_Y, + train_T = train_C, + test_Y = test_Y, + test_T = test_C, + target = 0, + col = "green", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " correct ", + output_dir = 
output_dir, + orig_space_dim = orig_space_dim) + + # errors + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_C, + train_Y = train_Y, + train_T = train_C, + test_Y = test_Y, + test_T = test_C, + target = 1, + col = "red", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " errors ", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # prediction 0 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_P, + train_Y = train_Y, + train_T = train_P, + test_Y = test_Y, + test_T = test_P, + target = 0, + col = "red", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " predict-0 ", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # prediction 1 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_P, + train_Y = train_Y, + train_T = train_P, + test_Y = test_Y, + test_T = test_P, + target = 1, + col = "green", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " predict-1 ", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + +def hdbscan_clustering(umap_data, train_data, test_data, info="", output_dir=""): + + clusterer = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500, prediction_data=True) + umap_labels = clusterer.fit_predict(umap_data) + train_labels, _ = hdbscan.approximate_predict(clusterer, train_data) + test_labels, _ = hdbscan.approximate_predict(clusterer, test_data) + + fig, ((ax00, ax01, ax02), (ax10, ax11, ax12)) = plt.subplots(2, 3) + fig.suptitle("HDBSCAN clastering: "+ info ) + + # plot umap data + umap_clustered = (umap_labels >= 0) + umap_coll = collections.Counter(umap_clustered) + print("umap_clustered", umap_coll) +# print("umap_data", umap_data.shape) +# print("~umap_clustered", umap_clustered.count(False), ~umap_clustered) + ax00.scatter(umap_data[~umap_clustered, 0], + umap_data[~umap_clustered, 1], + c=(0.5, 0.5, 0.5), + s=0.1, + alpha=0.5) + ax00.set_title("UMAP Outliers " + str(umap_coll[False]), fontsize=7) + ax10.scatter(umap_data[umap_clustered, 0], + umap_data[umap_clustered, 1], + c=umap_labels[umap_clustered], + s=0.1, + cmap="Spectral") + ax10.set_title("UMAP Inliers " + str(umap_coll[True]), fontsize=7) + + # plot train data + train_clustered = (train_labels >= 0) + train_coll = collections.Counter(train_clustered) + ax01.scatter(train_data[~train_clustered, 0], + train_data[~train_clustered, 1], + c=(0.5, 0.5, 0.5), + s=0.1, + alpha=0.5) + ax01.set_title("Train Outliers " + str(train_coll[False]), fontsize=7) + ax11.scatter(train_data[train_clustered, 0], + train_data[train_clustered, 1], + c=train_labels[train_clustered], + s=0.1, + cmap="Spectral") + ax11.set_title("Train Inliers " + str(train_coll[True]), fontsize=7) + + # plot test data + test_clustered = (test_labels >= 0) + test_coll = collections.Counter(test_clustered) + ax02.scatter(test_data[~test_clustered, 0], + test_data[~test_clustered, 1], + c=(0.5, 0.5, 0.5), + s=0.1, + alpha=0.5) + ax02.set_title("Tets Outliers " + str(test_coll[False]), fontsize=7) + ax12.scatter(test_data[test_clustered, 0], + test_data[test_clustered, 1], + c=test_labels[test_clustered], + s=0.1, + cmap="Spectral") + ax12.set_title("Test Inliers " + str(test_coll[True]), fontsize=7) + + plt.savefig(output_dir+"/"+info+"-hdbscan.png") + plt.close() + + +def visualize_all_data_umap(dlrm, + train_ld, + test_ld = None, + max_umap_size = 50000, + output_dir = "", + umap_metric = "euclidean"): + + data_ratio = 1 + + print("creating umap data") + umap_train_feat, 
umap_train_X, umap_train_cat, umap_train_T, umap_train_z, umap_train_c, umap_train_p = create_umap_data(dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size, offset=0, info="umap") + + # transform train and test data + train_feat, train_X, train_cat, train_T, train_z, train_c, train_p = create_umap_data(dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size*data_ratio, offset=max_umap_size, info="train") + test_feat, test_X, test_cat, test_T, test_z, test_c, test_p = create_umap_data(dlrm=dlrm, data_ld=test_ld, max_size=max_umap_size*data_ratio, offset=0, info="test") + + print("umap_train_feat", np.array(umap_train_feat).shape) + reducer_all_feat = umap.UMAP(random_state=42, metric=umap_metric) + umap_feat_Y = reducer_all_feat.fit_transform(umap_train_feat) + + train_feat_Y = reducer_all_feat.transform(train_feat) + test_feat_Y = reducer_all_feat.transform(test_feat) + + visualize_umap_data(umap_Y = umap_feat_Y, + umap_T = umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_feat_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_feat_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "all-features", + output_dir = output_dir, + orig_space_dim = np.array(umap_train_feat).shape[1]) + + hdbscan_clustering(umap_data = umap_feat_Y, + train_data = train_feat_Y, + test_data = test_feat_Y, + info = "umap-all-features", + output_dir = output_dir) + +# hdbscan_clustering(umap_data = np.array(umap_train_feat), +# train_data = np.array(train_feat), +# test_data = np.array(test_feat), +# info = "all-features", +# output_dir = output_dir) + + print("umap_train_X", np.array(umap_train_X).shape) + reducer_X = umap.UMAP(random_state=42, metric=umap_metric) + umap_X_Y = reducer_X.fit_transform(umap_train_X) + + train_X_Y = reducer_X.transform(train_X) + test_X_Y = reducer_X.transform(test_X) + + visualize_umap_data(umap_Y = umap_X_Y, + umap_T = umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_X_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_X_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "cont-features", + output_dir = output_dir, + orig_space_dim = np.array(umap_train_X).shape[1]) + + print("umap_train_cat", np.array(umap_train_cat).shape) + reducer_cat = umap.UMAP(random_state=42, metric=umap_metric) + umap_cat_Y = reducer_cat.fit_transform(umap_train_cat) + + train_cat_Y = reducer_cat.transform(train_cat) + test_cat_Y = reducer_cat.transform(test_cat) + + visualize_umap_data(umap_Y = umap_cat_Y, + umap_T = umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_cat_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_cat_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "cat-features", + output_dir = output_dir, + orig_space_dim = np.array(umap_train_cat).shape[1]) + + # UMAP for z data + for i in range(0,len(umap_train_z)): + print("z", i, np.array(umap_train_z[i]).shape) + reducer_z = umap.UMAP(random_state=42, metric=umap_metric) + umap_z_Y = reducer_z.fit_transform(umap_train_z[i]) + + train_z_Y = reducer_z.transform(train_z[i]) + test_z_Y = reducer_z.transform(test_z[i]) + + visualize_umap_data(umap_Y = umap_z_Y, + umap_T = 
umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_z_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_z_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "z-features-"+str(i), + output_dir = output_dir, + orig_space_dim = np.array(umap_train_z[i]).shape[1]) + + +def analyze_model_data(output_dir, + dlrm, + train_ld, + test_ld, + train_data, + skip_embedding = False, + use_tsne = False, + max_umap_size = 50000, + max_tsne_size = 10000, + skip_categorical_analysis = False, + skip_data_plots = False, + umap_metric = "euclidean"): + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if skip_embedding is False: + + cat_counts = None + + cat_counts = analyse_categorical_counts(X_cat=train_data.X_cat, emb_l=dlrm.emb_l, output_dir=output_dir) + + visualize_embeddings_umap(emb_l = dlrm.emb_l, + output_dir = output_dir, + max_size = max_umap_size, + umap_metric = umap_metric, + cat_counts = cat_counts) + + if use_tsne is True: + visualize_embeddings_tsne(emb_l = dlrm.emb_l, + output_dir = output_dir, + max_size = max_tsne_size) + + # data visualization and analysis + if skip_data_plots is False: + visualize_all_data_umap(dlrm=dlrm, train_ld=train_ld, test_ld=test_ld, max_umap_size=max_umap_size, output_dir=output_dir, umap_metric=umap_metric) + + # analyse categorical variables + if skip_categorical_analysis is False and args.data_randomize == "none": + analyse_categorical_data(X_cat=train_data.X_cat, n_days=10, output_dir=output_dir) + + + +if __name__ == "__main__": + + output_dir = "" + + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Exploratory DLRM analysis" + ) + + parser.add_argument("--load-model", type=str, default="") + parser.add_argument("--data-set", choices=["kaggle", "terabyte"], help="dataset") +# parser.add_argument("--dataset-path", required=True, help="path to the dataset") + parser.add_argument("--max-ind-range", type=int, default=-1) +# parser.add_argument("--mlperf-bin-loader", action="store_true", default=False) + parser.add_argument("--output-dir", type=str, default="") + parser.add_argument("--skip-embedding", action="store_true", default=False) + parser.add_argument("--umap-metric", type=str, default="euclidean") + parser.add_argument("--skip-data-plots", action="store_true", default=False) + parser.add_argument("--skip-categorical-analysis", action="store_true", default=False) + + # umap relatet + parser.add_argument("--max-umap-size", type=int, default=50000) + # tsne related + parser.add_argument("--use-tsne", action="store_true", default=False) + parser.add_argument("--max-tsne-size", type=int, default=1000) + # data file related + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--data-randomize", type=str, default="total") # none, total or day or none + parser.add_argument("--memory-map", action="store_true", default=False) + parser.add_argument("--mini-batch-size", type=int, default=1) + parser.add_argument("--num-workers", type=int, default=0) + parser.add_argument("--test-mini-batch-size", type=int, default=1) + parser.add_argument("--test-num-workers", type=int, default=0) + parser.add_argument("--num-batches", type=int, default=0) + # mlperf logging (disables other output 
and stops early)
+    parser.add_argument("--mlperf-logging", action="store_true", default=False)
+
+    args = parser.parse_args()
+
+    print("command line args: ", json.dumps(vars(args)))
+
+    output_dir = args.output_dir
+    if output_dir == "":
+        output_dir = args.data_set+"-"+os.path.split(args.load_model)[-1]+"-vis_all"
+    print("output_dir:", output_dir)
+
+    if args.data_set == "kaggle":
+        # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh)
+        m_spa=16
+        ln_emb=np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572])
+        ln_bot=np.array([13,512,256,64,16])
+        ln_top=np.array([367,512,256,1])
+    elif args.data_set == "terabyte":
+        if args.max_ind_range == 10000000:
+            # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-ind-range=10000000)
+            m_spa=64
+            ln_emb=np.array([9980333,36084,17217,7378,20134,3,7112,1442,61, 9758201,1333352,313829,10,2208,11156,122,4,970,14, 9994222, 7267859, 9946608,415421,12420,101, 36])
+            ln_bot=np.array([13,512,256,64])
+            ln_top=np.array([415,512,512,256,1])
+        elif args.max_ind_range == 40000000:
+            # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-ind-range=40000000)
+            m_spa=128
+            ln_emb=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36])
+            ln_bot=np.array([13,512,256,128])
+            ln_top=np.array([479,1024,1024,512,256,1])
+        else:
+            raise ValueError("only --max-ind-range 10M or 40M is supported")
+    else:
+        raise ValueError("only kaggle|terabyte dataset options are supported")
+
+    # check input parameters
+    if args.data_randomize != "none" and args.skip_categorical_analysis is not True:
+        print("Incorrect option for categorical analysis, use: --data-randomize=none")
+        sys.exit(-1)
+
+    dlrm = DLRM_Net(
+        m_spa,
+        ln_emb,
+        ln_bot,
+        ln_top,
+        arch_interaction_op="dot",
+        arch_interaction_itself=False,
+        sigmoid_bot=-1,
+        sigmoid_top=ln_top.size - 2,
+        sync_dense_params=True,
+        loss_threshold=0.0,
+        ndevices=-1,
+        qr_flag=False,
+        qr_operation=None,
+        qr_collisions=None,
+        qr_threshold=None,
+        md_flag=False,
+        md_threshold=None,
+    )
+
+    # Load model if specified
+    if not (args.load_model == ""):
+        print("Loading saved model {}".format(args.load_model))
+
+        ld_model = torch.load(args.load_model, map_location=torch.device("cpu"))
+        dlrm.load_state_dict(ld_model["state_dict"])
+
+        print("Model loaded", args.load_model)
+        #print(dlrm)
+
+        z_size = len(dlrm.top_l)
+        for i in range(0, z_size):
+            print("z", i, dlrm.top_l[i])
+
+    # load data
+    train_data = None
+    test_data = None
+
+    if args.raw_data_file != "" or args.processed_data_file != "":
+        train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args)
+
+    analyze_model_data(output_dir = output_dir,
+                       dlrm = dlrm,
+                       train_ld = train_ld,
+                       test_ld = test_ld,
+                       train_data = train_data,
+                       skip_embedding = args.skip_embedding,
+                       use_tsne = args.use_tsne,
+                       max_umap_size = args.max_umap_size,
+                       max_tsne_size = args.max_tsne_size,
+                       skip_categorical_analysis = args.skip_categorical_analysis,
+                       skip_data_plots = args.skip_data_plots,
+                       umap_metric = args.umap_metric)
+
diff --git a/benchmarks/dlrm/ootb/tricks/md_embedding_bag.py b/benchmarks/dlrm/ootb/tricks/md_embedding_bag.py
new file mode 100644 index 0000000..7c4071a
--- /dev/null
+++ b/benchmarks/dlrm/ootb/tricks/md_embedding_bag.py
@@ -0,0 +1,81 @@
+# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Mixed-Dimensions Trick +# +# Description: Applies mixed dimension trick to embeddings to reduce +# embedding sizes. +# +# References: +# [1] Antonio Ginart, Maxim Naumov, Dheevatsa Mudigere, Jiyan Yang, James Zou, +# "Mixed Dimension Embeddings with Application to Memory-Efficient Recommendation +# Systems", CoRR, arXiv:1909.11810, 2019 +from __future__ import absolute_import, division, print_function, unicode_literals +import torch +import torch.nn as nn + + +def md_solver(n, alpha, d0=None, B=None, round_dim=True, k=None): + ''' + An external facing function call for mixed-dimension assignment + with the alpha power temperature heuristic + Inputs: + n -- (torch.LongTensor) ; Vector of num of rows for each embedding matrix + alpha -- (torch.FloatTensor); Scalar, non-negative, controls dim. skew + d0 -- (torch.FloatTensor); Scalar, baseline embedding dimension + B -- (torch.FloatTensor); Scalar, parameter budget for embedding layer + round_dim -- (bool); flag for rounding dims to nearest pow of 2 + k -- (torch.LongTensor) ; Vector of average number of queries per inference + ''' + n, indices = torch.sort(n) + k = k[indices] if k is not None else torch.ones(len(n)) + d = alpha_power_rule(n.type(torch.float) / k, alpha, d0=d0, B=B) + if round_dim: + d = pow_2_round(d) + undo_sort = [0] * len(indices) + for i, v in enumerate(indices): + undo_sort[v] = i + return d[undo_sort] + + +def alpha_power_rule(n, alpha, d0=None, B=None): + if d0 is not None: + lamb = d0 * (n[0].type(torch.float) ** alpha) + elif B is not None: + lamb = B / torch.sum(n.type(torch.float) ** (1 - alpha)) + else: + raise ValueError("Must specify either d0 or B") + d = torch.ones(len(n)) * lamb * (n.type(torch.float) ** (-alpha)) + for i in range(len(d)): + if i == 0 and d0 is not None: + d[i] = d0 + else: + d[i] = 1 if d[i] < 1 else d[i] + return (torch.round(d).type(torch.long)) + + +def pow_2_round(dims): + return 2 ** torch.round(torch.log2(dims.type(torch.float))) + + +class PrEmbeddingBag(nn.Module): + def __init__(self, num_embeddings, embedding_dim, base_dim): + super(PrEmbeddingBag, self).__init__() + self.embs = nn.EmbeddingBag( + num_embeddings, embedding_dim, mode="sum", sparse=True) + torch.nn.init.xavier_uniform_(self.embs.weight) + if embedding_dim < base_dim: + self.proj = nn.Linear(embedding_dim, base_dim, bias=False) + torch.nn.init.xavier_uniform_(self.proj.weight) + elif embedding_dim == base_dim: + self.proj = nn.Identity() + else: + raise ValueError( + "Embedding dim " + str(embedding_dim) + " > base dim " + str(base_dim) + ) + + def forward(self, input, offsets=None, per_sample_weights=None): + return self.proj(self.embs( + input, offsets=offsets, per_sample_weights=per_sample_weights)) diff --git a/benchmarks/dlrm/ootb/tricks/qr_embedding_bag.py b/benchmarks/dlrm/ootb/tricks/qr_embedding_bag.py new file mode 100644 index 0000000..290d795 --- /dev/null +++ b/benchmarks/dlrm/ootb/tricks/qr_embedding_bag.py @@ -0,0 +1,185 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Quotient-Remainder Trick +# +# Description: Applies quotient remainder-trick to embeddings to reduce +# embedding sizes. 
+# +# References: +# [1] Hao-Jun Michael Shi, Dheevatsa Mudigere, Maxim Naumov, Jiyan Yang, +# "Compositional Embeddings Using Complementary Partitions for Memory-Efficient +# Recommendation Systems", CoRR, arXiv:1909.02107, 2019 + + +from __future__ import absolute_import, division, print_function, unicode_literals +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.parameter import Parameter +import numpy as np + + +class QREmbeddingBag(nn.Module): + r"""Computes sums or means over two 'bags' of embeddings, one using the quotient + of the indices and the other using the remainder of the indices, without + instantiating the intermediate embeddings, then performs an operation to combine these. + + For bags of constant length and no :attr:`per_sample_weights`, this class + + * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=0)``, + * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=0)``, + * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=0)``. + + However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these + operations. + + QREmbeddingBag also supports per-sample weights as an argument to the forward + pass. This scales the output of the Embedding before performing a weighted + reduction as specified by ``mode``. If :attr:`per_sample_weights`` is passed, the + only supported ``mode`` is ``"sum"``, which computes a weighted sum according to + :attr:`per_sample_weights`. + + Known Issues: + Autograd breaks with multiple GPUs. It breaks only with multiple embeddings. + + Args: + num_categories (int): total number of unique categories. The input indices must be in + 0, 1, ..., num_categories - 1. + embedding_dim (list): list of sizes for each embedding vector in each table. If ``"add"`` + or ``"mult"`` operation are used, these embedding dimensions must be + the same. If a single embedding_dim is used, then it will use this + embedding_dim for both embedding tables. + num_collisions (int): number of collisions to enforce. + operation (string, optional): ``"concat"``, ``"add"``, or ``"mult". Specifies the operation + to compose embeddings. ``"concat"`` concatenates the embeddings, + ``"add"`` sums the embeddings, and ``"mult"`` multiplies + (component-wise) the embeddings. + Default: ``"mult"`` + max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` + is renormalized to have norm :attr:`max_norm`. + norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. + scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of frequency of + the words in the mini-batch. Default ``False``. + Note: this option is not supported when ``mode="max"``. + mode (string, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag. + ``"sum"`` computes the weighted sum, taking :attr:`per_sample_weights` + into consideration. ``"mean"`` computes the average of the values + in the bag, ``"max"`` computes the max value over each bag. + Default: ``"mean"`` + sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See + Notes for more details regarding sparse gradients. Note: this option is not + supported when ``mode="max"``. 
+ + Attributes: + weight (Tensor): the learnable weights of each embedding table is the module of shape + `(num_embeddings, embedding_dim)` initialized using a uniform distribution + with sqrt(1 / num_categories). + + Inputs: :attr:`input` (LongTensor), :attr:`offsets` (LongTensor, optional), and + :attr:`per_index_weights` (Tensor, optional) + + - If :attr:`input` is 2D of shape `(B, N)`, + + it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and + this will return ``B`` values aggregated in a way depending on the :attr:`mode`. + :attr:`offsets` is ignored and required to be ``None`` in this case. + + - If :attr:`input` is 1D of shape `(N)`, + + it will be treated as a concatenation of multiple bags (sequences). + :attr:`offsets` is required to be a 1D tensor containing the + starting index positions of each bag in :attr:`input`. Therefore, + for :attr:`offsets` of shape `(B)`, :attr:`input` will be viewed as + having ``B`` bags. Empty bags (i.e., having 0-length) will have + returned vectors filled by zeros. + + per_sample_weights (Tensor, optional): a tensor of float / double weights, or None + to indicate all weights should be taken to be ``1``. If specified, :attr:`per_sample_weights` + must have exactly the same shape as input and is treated as having the same + :attr:`offsets`, if those are not ``None``. Only supported for ``mode='sum'``. + + + Output shape: `(B, embedding_dim)` + + """ + __constants__ = ['num_categories', 'embedding_dim', 'num_collisions', + 'operation', 'max_norm', 'norm_type', 'scale_grad_by_freq', + 'mode', 'sparse'] + + def __init__(self, num_categories, embedding_dim, num_collisions, + operation='mult', max_norm=None, norm_type=2., + scale_grad_by_freq=False, mode='mean', sparse=False, + _weight=None): + super(QREmbeddingBag, self).__init__() + + assert operation in ['concat', 'mult', 'add'], 'Not valid operation!' + + self.num_categories = num_categories + if isinstance(embedding_dim, int) or len(embedding_dim) == 1: + self.embedding_dim = [embedding_dim, embedding_dim] + else: + self.embedding_dim = embedding_dim + self.num_collisions = num_collisions + self.operation = operation + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + + if self.operation == 'add' or self.operation == 'mult': + assert self.embedding_dim[0] == self.embedding_dim[1], \ + 'Embedding dimensions do not match!' 
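+        # The quotient-remainder decomposition used in forward(): each category
+        # index i is looked up as (i // num_collisions) in the quotient table and
+        # (i % num_collisions) in the remainder table, and the two embeddings are
+        # combined with `operation`. E.g., num_categories=1000 with
+        # num_collisions=4 stores ceil(1000/4)=250 + 4 rows instead of 1000.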
+ + self.num_embeddings = [int(np.ceil(num_categories / num_collisions)), + num_collisions] + + if _weight is None: + self.weight_q = Parameter(torch.Tensor(self.num_embeddings[0], self.embedding_dim[0])) + self.weight_r = Parameter(torch.Tensor(self.num_embeddings[1], self.embedding_dim[1])) + self.reset_parameters() + else: + assert list(_weight[0].shape) == [self.num_embeddings[0], self.embedding_dim[0]], \ + 'Shape of weight for quotient table does not match num_embeddings and embedding_dim' + assert list(_weight[1].shape) == [self.num_embeddings[1], self.embedding_dim[1]], \ + 'Shape of weight for remainder table does not match num_embeddings and embedding_dim' + self.weight_q = Parameter(_weight[0]) + self.weight_r = Parameter(_weight[1]) + self.mode = mode + self.sparse = sparse + + def reset_parameters(self): + nn.init.uniform_(self.weight_q, np.sqrt(1 / self.num_categories)) + nn.init.uniform_(self.weight_r, np.sqrt(1 / self.num_categories)) + + def forward(self, input, offsets=None, per_sample_weights=None): + input_q = (input / self.num_collisions).long() + input_r = torch.remainder(input, self.num_collisions).long() + + embed_q = F.embedding_bag(input_q, self.weight_q, offsets, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.mode, + self.sparse, per_sample_weights) + embed_r = F.embedding_bag(input_r, self.weight_r, offsets, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.mode, + self.sparse, per_sample_weights) + + if self.operation == 'concat': + embed = torch.cat((embed_q, embed_r), dim=1) + elif self.operation == 'add': + embed = embed_q + embed_r + elif self.operation == 'mult': + embed = embed_q * embed_r + + return embed + + def extra_repr(self): + s = '{num_embeddings}, {embedding_dim}' + if self.max_norm is not None: + s += ', max_norm={max_norm}' + if self.norm_type != 2: + s += ', norm_type={norm_type}' + if self.scale_grad_by_freq is not False: + s += ', scale_grad_by_freq={scale_grad_by_freq}' + s += ', mode={mode}' + return s.format(**self.__dict__) diff --git a/benchmarks/dlrm/ubench/README_comms.md b/benchmarks/dlrm/ubench/README_comms.md new file mode 100644 index 0000000..5a76db0 --- /dev/null +++ b/benchmarks/dlrm/ubench/README_comms.md @@ -0,0 +1,5 @@ +# dlrm_ubench_comms_driver.py runs /param/train/comms/pt/comms.py. 
+ +# Note +If /param is empty, change to that directory and run: +git submodule update --init --recursive diff --git a/benchmarks/dlrm/ubench/dlrm_ubench_comms_driver.py b/benchmarks/dlrm/ubench/dlrm_ubench_comms_driver.py new file mode 100644 index 0000000..e157fc0 --- /dev/null +++ b/benchmarks/dlrm/ubench/dlrm_ubench_comms_driver.py @@ -0,0 +1,130 @@ +import argparse +import contextlib +import io +import itertools +import os +import pathlib +import subprocess +import sys +from itertools import product +from os import fspath + +# param ubenches +p = pathlib.Path(__file__).parent.resolve() / "../../../param/train/compute/pt" +sys.path.append(fspath(p)) +import dataset +import pytorch_emb as kemb +import pytorch_gemm as kgemm +import pytorch_linear as klinear + +# FB5 Logger +p = pathlib.Path(__file__).parent.resolve() / "../../../fb5logging" +sys.path.append(fspath(p)) +import loggerconstants +from fb5logger import FB5Logger + + +def main(): + parser = argparse.ArgumentParser(description="comms.py driver") + parser.add_argument( + "--size", + type=str, + default="small", + ) + parser.add_argument( + "--backend", + type=str, + default=("nccl"), + choices=["nccl", "gloo", "mpi", "ucc", "xla"], + ) + parser.add_argument( + "--collective", + type=str, + default=("all_to_all"), + choices=["all_to_all", "all_reduce"], + ) + parser.add_argument("--fb5logger", type=str, default=None) + args = parser.parse_args() + + if args.size not in ["small", "medium", "large"] and not ( + args.size.isdigit() and int(args.size) > 0 + ): + sys.exit("The --size argument provided is not a valid positive integer.") + + lookup = { + "small": 2200 if args.collective == "all_reduce" else 134000000, + "medium": 9944 if args.collective == "all_reduce" else 244000000, + "large": 22372 if args.collective == "all_reduce" else 544000000, + str(2200): "small" if args.collective == "all_reduce" else 2200, + str(9944): "medium" if args.collective == "all_reduce" else 9944, + str(22372): "large" if args.collective == "all_reduce" else 22372, + str(134000000): "small" if args.collective == "all_to_all" else 134000000, + str(244000000): "medium" if args.collective == "all_to_all" else 244000000, + str(544000000): "large" if args.collective == "all_to_all" else 544000000, + } + (x, y) = (args.size, lookup.get(args.size, args.size)) + (size, name) = (x, y) if args.size.isdigit() else (y, x) + + master_ip = "localhost" + num_compute_per_collective = 100 + mm_dim = 1000 + num_iter = 100 + + cmd = f""" + --f 2 + --n {num_iter} + --master-ip {master_ip} + --master-port 22565 + --collective {args.collective} + --b {size} + --e {size} + --num-compute {num_compute_per_collective} + --mm-dim {mm_dim} + --backend {args.backend} + """ + sys.argv = cmd.replace("\n", " ").replace(" ", "").split() + + print("") + comms_abs_dir_path = str( + pathlib.Path(__file__).absolute().parents[3].resolve() / "param/train/comms/pt" + ) + sys.path.append(comms_abs_dir_path) + from comms import main as comms_main + + fb5logger = FB5Logger(args.fb5logger) + fb5logger.header( + "DLRM", + "UBENCH", + "train", + "comms_" + args.collective.replace("_", "") + "_" + name, + score_metric=loggerconstants.GBPS, + ) + + comms_stdout = io.StringIO() + with contextlib.redirect_stdout(comms_stdout): + fb5logger.run_start() + comms_main() + + output = comms_stdout.getvalue().split("\n")[-3:] + output = [_.split("\t") for _ in output] + output[1].insert(4, "") + output[0][4] = "Latency(us):" + output[0].insert(5, "p50") + output[0].pop(7) + output[0].pop(0) + 
output[1].pop(0) + extra_metadata = {} + for a, b in zip(output[0], output[1]): + extra_metadata[a.lstrip()] = b.lstrip() + fb5logger.run_stop( + num_batches=num_iter, batch_size=None, extra_metadata=extra_metadata + ) + + print(comms_stdout.getvalue()) + print("-- Pretty Format --") + for a, b in zip(output[0], output[1]): + print("{:<15s}{:>4s}".format(a.lstrip(), b.lstrip())) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/dlrm/ubench/dlrm_ubench_train_driver.py b/benchmarks/dlrm/ubench/dlrm_ubench_train_driver.py new file mode 100644 index 0000000..15f407c --- /dev/null +++ b/benchmarks/dlrm/ubench/dlrm_ubench_train_driver.py @@ -0,0 +1,122 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import sys +import pathlib +from os import fspath +# param ubenches +p = pathlib.Path(__file__).parent.resolve() / "../../../param/train/compute/pt" +sys.path.append(fspath(p)) +import dataset +import pytorch_gemm as kgemm +import pytorch_emb as kemb +import pytorch_linear as klinear + +# FB5 Logger +p = pathlib.Path(__file__).parent.resolve() / "../../../fb5logging" +sys.path.append(fspath(p)) +from fb5logger import FB5Logger +import loggerconstants + +if __name__ == "__main__": + + import argparse + + parser = argparse.ArgumentParser( + description="Measuring the Compute Kernel Performance Using PyTorch" + ) + parser.add_argument('--warmups', type=int, default=10, help="warmup times") + parser.add_argument('--steps', type=int, default=100, help="repeat times") + parser.add_argument('--device', type=str, choices=['cpu', 'gpu', 'tpu'], required=True, help='valid devices') + parser.add_argument("--fb5logger", type=str, default=None) + + subparsers = parser.add_subparsers(title='kernels', dest='kernel') + subparsers.required = True + + parser_emb = subparsers.add_parser('emb', help='measure EmbeddingBag performance') + parser_emb.add_argument('-d', '--dataset', default='B') + parser_emb.add_argument("--randomseed", type=int, default=0) + parser_emb.add_argument("--usexlabag", action='store_true', help='use xlabad instead of embeddingbag') + parser_emb.add_argument("--alpha", default=0.0, help="Zipf param. 
Use uniform if == 0.0") + + parser_linear = subparsers.add_parser('linear', help='measure mlp performance') + parser_linear.add_argument('--optimizer-type', default='sgd', help='Optimizer: SGD', choices=['sgd']) + parser_linear.add_argument('-t', '--dtype', default='float', help="data type", choices=["float", "float16", "bfloat16"]) + parser_linear.add_argument('-d', '--dataset', default='small') + + # FB5 Logging + + args=parser.parse_args() + + print("Measuring the performance of ", args.kernel, " on device = ", args.device) + print("Steps = ", args.steps, " warmups = ", args.warmups) + + #fb5 logging header + if args.fb5logger is not None: + fb5logger = FB5Logger(args.fb5logger) + + if args.kernel == 'emb': + print("with emb dataset ", args.dataset) + global_bytes = 0 + global_elap = 0 + if args.fb5logger is not None: + fb5logger.header("DLRM", "UBENCH", "train", args.kernel + "_" + args.dataset, score_metric=loggerconstants.GBPS) + fb5logger.run_start() + if args.dataset == 'A': + run_dataset = dataset.emb_A + elif args.dataset == 'B': + run_dataset = dataset.emb_B + elif args.dataset == 'small': + small_dataset = [ (4800000, 56, 34, 2048), + (4800000, 56, 34, 4096),] + run_dataset = small_dataset + else: + import ast + run_dataset = ast.literal_eval(args.dataset) + for i in range(len(run_dataset)): + features, embdim, nnz, batch = run_dataset[i] + elap, total_bytes = kemb.run_single(args, features, embdim, nnz, batch) + elap /= args.steps + total_bytes /= 1.0e6 + global_bytes += total_bytes + global_elap += elap + if args.fb5logger is not None: + extra_metadata={"GB/s": global_bytes / global_elap / 1.0e3, "ELAP": global_elap, "BYTES": global_bytes} + fb5logger.run_stop(args.steps, batch, extra_metadata=extra_metadata) + else: + print("with linear dataset ", args.dataset, ", Data type: ", args.dtype) + global_flops = 0 + global_elap = 0 + if args.fb5logger is not None: + fb5logger.header("DLRM", "UBENCH", "train", args.kernel + "_" + args.dataset, score_metric=loggerconstants.TFPS) + fb5logger.run_start() + if args.dataset == 'A': + run_dataset = dataset.mlp_A + elif args.dataset == 'small': + small_dataset = [ (18, 1024, 1024, 1024, 128), + (18, 1024, 1024, 1024, 256),] + run_dataset = small_dataset + else: + import ast + run_dataset = ast.literal_eval(args.dataset) + for i in range(len(run_dataset)): + layer_num, input_size, hidden_size, output_size, batch_size = run_dataset[i] + elap, loss = klinear.run_single( + args, layer_num, input_size, hidden_size, output_size, batch_size + ) + elap /= args.steps + + flops = batch_size * ( + hidden_size * hidden_size * layer_num + + hidden_size * input_size + + hidden_size * output_size + ) + # Forward 2x and Backward 4x + flops *= 6 + global_flops += flops + global_elap += elap + if args.fb5logger is not None: + extra_metadata={"TF/s": global_flops / global_elap / 1.0e12, "ELAP": global_elap, "FLOPS": global_flops} + fb5logger.run_stop(args.steps, batch_size, extra_metadata=extra_metadata) diff --git a/benchmarks/rnnt/ootb/inference/QSL.py b/benchmarks/rnnt/ootb/inference/QSL.py new file mode 100644 index 0000000..3848ca3 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/QSL.py @@ -0,0 +1,71 @@ +import sys +import os +from os import fspath +import pathlib +sys.path.insert(0, fspath(pathlib.Path(__file__).parent.resolve() / "./pytorch")) + +from parts.manifest import Manifest +from parts.segment import AudioSegment + +import numpy as np + +import mlperf_loadgen as lg + + +class AudioQSL: + def __init__(self, dataset_dir, manifest_filepath, 
labels, + sample_rate=16000, perf_count=None): + m_paths = [manifest_filepath] + self.manifest = Manifest(dataset_dir, m_paths, labels, len(labels), + normalize=True, max_duration=15.0) + self.sample_rate = sample_rate + self.count = len(self.manifest) + perf_count = self.count if perf_count is None else perf_count + self.sample_id_to_sample = {} + self.qsl = lg.ConstructQSL(self.count, perf_count, + self.load_query_samples, + self.unload_query_samples) + print( + "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours. Number of samples: {2}".format( + self.manifest.duration / 3600, + self.manifest.filtered_duration / 3600, + self.count)) + + def load_query_samples(self, sample_list): + for sample_id in sample_list: + self.sample_id_to_sample[sample_id] = self._load_sample(sample_id) + + def unload_query_samples(self, sample_list): + for sample_id in sample_list: + del self.sample_id_to_sample[sample_id] + + def _load_sample(self, index): + sample = self.manifest[index] + segment = AudioSegment.from_file(sample['audio_filepath'][0], + target_sr=self.sample_rate) + waveform = segment.samples + assert isinstance(waveform, np.ndarray) and waveform.dtype == np.float32 + return waveform + + def __getitem__(self, index): + return self.sample_id_to_sample[index] + + def __del__(self): + lg.DestroyQSL(self.qsl) + print("Finished destroying QSL.") + + +# We have no problem fitting all data in memory, so we do that, in +# order to speed up execution of the benchmark. +class AudioQSLInMemory(AudioQSL): + def __init__(self, dataset_dir, manifest_filepath, labels, + sample_rate=16000, perf_count=None): + super().__init__(dataset_dir, manifest_filepath, labels, + sample_rate, perf_count) + super().load_query_samples(range(self.count)) + + def load_query_samples(self, sample_list): + pass + + def unload_query_samples(self, sample_list): + pass diff --git a/benchmarks/rnnt/ootb/inference/README.md b/benchmarks/rnnt/ootb/inference/README.md new file mode 100644 index 0000000..27fbabd --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/README.md @@ -0,0 +1,116 @@ +# 1. Problem +Speech recognition accepts raw audio samples and produces a corresponding +character transcription, without an external language model. + +# 2. Directions + +Open `run.sh`. Set the stage variable to "-1". Set "work_dir" to a +path backed by a disk with at least 30 GB of space. Most space is used +by loadgen logs, not the data or model. You need conda and a C/C++ +compiler on your PATH. I used conda 4.8.2. This script is responsible +for downloading dependencies, data, and the model. + +Run `./run.sh` from this directory. Note that stage 3 runs all of the +scenarios for the reference implementation, which will take a long +time, so you may want to exist before then. + +As you complete individual stages, you can set the variable "stage" to +a higher number for restarting from a later stage. + +# 3. Dataset/Environment +### Publication/Attribution +["OpenSLR LibriSpeech Corpus"](http://www.openslr.org/12/) provides over 1000 hours of speech data in the form of raw audio. +We use dev-clean, which is approximately 5 hours. We remove all samples with a length exceeding 15 seconds. + +### Data preprocessing +Log filterbanks of size 80 are extracted every 10 milliseconds, from +windows of size 20 milliseconds. Note that every three filterbanks are +concatenated together ("feature splicing"), so the model's effective +frame rate is actually 30 milliseconds. + +No dithering takes place. 
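+
+As a rough sketch of the frame-splicing arithmetic described above (this is
+not part of the reference code; the function below and its names are purely
+illustrative), stacking every three 10 millisecond, 80-dimensional filterbank
+frames yields one 240-dimensional feature every 30 milliseconds:
+
+```python
+import numpy as np
+
+def splice_frames(filterbanks, stack=3):
+    """Concatenate every `stack` consecutive frames.
+
+    filterbanks: array of shape (num_frames, 80), one frame per 10 ms hop.
+    Returns shape (num_frames // stack, 80 * stack): one spliced vector
+    per 30 ms when stack == 3.
+    """
+    num_frames, num_bins = filterbanks.shape
+    usable = (num_frames // stack) * stack  # drop the ragged tail
+    return filterbanks[:usable].reshape(-1, stack * num_bins)
+
+# e.g. ~6.6 s of audio -> 659 frames at a 10 ms hop -> 219 spliced frames
+spliced = splice_frames(np.random.randn(659, 80).astype(np.float32))
+print(spliced.shape)  # (219, 240)
+```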
+ +This is not typical preprocessing, since it takes place as part of the +model's measured runtime, not before the model runs. + +### Test data order + +Look at dev-clean-wav.json generated by run.sh. It looks like this: + +``` +[ + { + "files": [ + { + "channels": 1, + "sample_rate": 16000.0, + "bitrate": 16, + "duration": 6.59, + "num_samples": 105440, + "encoding": "Signed Integer PCM", + "silent": false, + "fname": "dev-clean-wav/2277/149896/2277-149896-0000.wav", + "speed": 1 + } + ], + "original_duration": 6.59, + "original_num_samples": 105440, + "transcript": "he was in a fevered state of mind owing to the blight his wife's action threatened to cast upon his entire future" + }, + { + "files": [ + { + "channels": 1, + "sample_rate": 16000.0, + "bitrate": 16, + "duration": 7.145, + "num_samples": 114320, + "encoding": "Signed Integer PCM", + "silent": false, + "fname": "dev-clean-wav/2277/149896/2277-149896-0001.wav", + "speed": 1 + } + ], + "original_duration": 7.145, + "original_num_samples": 114320, + "transcript": "he would have to pay her the money which she would now regularly demand or there would be trouble it did not matter what he did" + }, + ... +] +``` + +The data is loaded into memory. Then all samples with a duration above +15 seconds are filtered out. Then the first object in the array is +assigned query id 0, the second is assigned query id 1, etc. The +unfiltered file is uploaded to the directory containing README in case +you do not want to recreate this file. + +# 4. Model +This is a variant of the model described in sections 3.1 and 6.2 of: + +@article{, + title={STREAMING END-TO-END SPEECH RECOGNITION FOR MOBILE DEVICES}, + author={Yanzhang He, Tara N. Sainath, Rohit Prabhavalkar, Ian McGraw, Raziel Alvarez, Ding Zhao, + David Rybach, Anjuli Kannan, Yonghui Wu, Ruoming Pang, Qiao Liang, Deepti Bhatia, Yuan Shangguan, + Bo Li, Golan Pundak, Khe Chai Sim, Tom Bagby, Shuo-yiin Chang, Kanishka Rao, Alexander Gruenstein}, + journal={arXiv preprint arXiv:1811.06621}, + year={2018} +} + +The differences are as follows: + +1. The model has 45.3 million parameters, rather than 120 million parameters +1. The LSTMs are not followed by projection layers +1. No layer normalization is used +1. Hidden dimensions are smaller. +1. The prediction network is made of two LSTMs, rather than seven. +1. The labels are characters, rather than word pieces. +1. No quantization is done at this time for inference. +1. A greedy decoder is used, rather than a beamsearch decoder. This greatly + reduces inference complexity. + +# 5. Quality +### Quality metric +7.452253714852645% Word Error Rate (WER) across all words in the output text of +all samples less than 15 seconds in length in the dev-clean set, using a greedy +decoder and a fully FP32 model. 
\ No newline at end of file diff --git a/benchmarks/rnnt/ootb/inference/accuracy_eval.py b/benchmarks/rnnt/ootb/inference/accuracy_eval.py new file mode 100644 index 0000000..ea81792 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/accuracy_eval.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +import argparse +import array +import json +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "pytorch")) + +from QSL import AudioQSL +from helpers import process_evaluation_epoch, __gather_predictions +from parts.manifest import Manifest + +dtype_map = { + "int8": 'b', + "int16": 'h', + "int32": 'l', + "int64": 'q', +} + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", required=True) + parser.add_argument("--dataset_dir", required=True) + parser.add_argument("--manifest", required=True) + parser.add_argument("--output_dtype", default="int64", choices=dtype_map.keys(), help="Output data type") + args = parser.parse_args() + return args + +def main(): + args = get_args() + labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] + qsl = AudioQSL(args.dataset_dir, args.manifest, labels) + manifest = qsl.manifest + with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh: + results = json.load(fh) + hypotheses = [] + references = [] + for result in results: + hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) + references.append(manifest[result["qsl_idx"]]["transcript"]) + + references = __gather_predictions([references], labels=labels) + hypotheses = __gather_predictions([hypotheses], labels=labels) + + d = dict(predictions=hypotheses, + transcripts=references) + wer = process_evaluation_epoch(d) + print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100)) + +if __name__ == '__main__': + main() diff --git a/benchmarks/rnnt/ootb/inference/environment.yml b/benchmarks/rnnt/ootb/inference/environment.yml new file mode 100644 index 0000000..4958247 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/environment.yml @@ -0,0 +1,128 @@ +name: mlperf-rnnt +channels: + - pytorch + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - absl-py=0.9.0=py36_0 + - blas=1.0=mkl + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2020.4.5.1=hecc5488_0 + - certifi=2020.4.5.1=py36h9f0ad1d_0 + - cffi=1.14.0=py36h2e261b9_0 + - cmake=3.14.0=h52cb24c_0 + - cudatoolkit=10.1.243=h6bb024c_0 + - cudatoolkit-dev=10.1.243=h516909a_3 + - expat=2.2.6=he6710b0_0 + - freetype=2.9.1=h8a8886c_1 + - gdb=8.3.1=py36h497da48_1 + - intel-openmp=2020.0=166 + - jpeg=9b=h024ee3a_2 + - krb5=1.17.1=h173b8e3_0 + - lame=3.100=h14c3975_1001 + - ld_impl_linux-64=2.33.1=h53a641e_7 + - libcurl=7.69.1=h20c2e04_0 + - libedit=3.1.20181209=hc058e9b_0 + - libffi=3.2.1=hd88cf55_4 + - libgcc-ng=9.1.0=hdf63c60_0 + - libgfortran-ng=7.3.0=hdf63c60_0 + - libpng=1.6.37=hbc83047_0 + - libssh2=1.9.0=h1ba5d50_1 + - libstdcxx-ng=9.1.0=hdf63c60_0 + - libtiff=4.1.0=h2733197_0 + - mad=0.15.1b=he1b5a44_0 + - mkl=2020.0=166 + - mkl-include=2020.0=166 + - mkl-service=2.3.0=py36he904b0f_0 + - mkl_fft=1.0.15=py36ha843d7b_0 + - mkl_random=1.1.0=py36hd6b4f25_0 + - ncurses=6.1=hf484d3e_1002 + - ninja=1.9.0=py36hfd86e86_0 + - numpy=1.18.1=py36h4f9e942_0 + - numpy-base=1.18.1=py36hde5b4d6_1 + - olefile=0.46=py_0 + - openssl=1.1.1g=h516909a_0 + - pillow=7.0.0=py36hb39fc2d_0 + - pip=20.0.2=py36_1 + - pycparser=2.20=py_0 + 
- python=3.6.10=h0371630_0 + - python_abi=3.6=1_cp36m + - pytorch=1.5.0=py3.6_cuda10.1.243_cudnn7.6.3_0 + - pyyaml=5.3.1=py36h7b6447c_0 + - readline=7.0=hf8c457e_1001 + - rhash=1.3.8=h1ba5d50_0 + - setuptools=46.1.3=py36_0 + - six=1.14.0=py36_0 + - sqlite=3.31.1=h7b6447c_0 + - tk=8.6.8=hbc83047_0 + - torchvision=0.6.0=py36_cu101 + - wheel=0.34.2=py36_0 + - xz=5.2.4=h14c3975_4 + - yaml=0.1.7=had09818_2 + - zlib=1.2.11=h7b6447c_3 + - zstd=1.3.7=h0b5b093_0 + - pip: + - ascii-graph==1.5.1 + - attrs==19.3.0 + - audioread==2.1.8 + - autopep8==1.5.1 + - backcall==0.1.0 + - chardet==3.0.4 + - coverage==5.0.4 + - decorator==4.4.2 + - entrypoints==0.3 + - flake8==3.7.9 + - grpcio==1.28.1 + - idna==2.9 + - importlib-metadata==1.6.0 + - inflect==4.1.0 + - ipdb==0.13.2 + - ipython==7.13.0 + - ipython-genutils==0.2.0 + - jedi==0.16.0 + - joblib==0.14.1 + - librosa==0.7.2 + - llvmlite==0.31.0 + - markdown==3.2.1 + - mccabe==0.6.1 + - more-itertools==8.2.0 + - numba==0.48.0 + - onnx==1.6.0 + - onnxruntime==1.2.0 + - packaging==20.3 + - pandas==0.24.2 + - parso==0.6.2 + - pexpect==4.8.0 + - pickleshare==0.7.5 + - pluggy==0.13.1 + - prompt-toolkit==3.0.5 + - protobuf==3.11.3 + - ptyprocess==0.6.0 + - py==1.8.1 + - pycodestyle==2.5.0 + - pyflakes==2.1.1 + - pygments==2.6.1 + - pyparsing==2.4.7 + - pytest==5.4.2 + - python-dateutil==2.8.1 + - pytz==2019.3 + - requests==2.23.0 + - resampy==0.2.2 + - scikit-learn==0.22.2.post1 + - scipy==1.4.1 + - soundfile==0.10.3.post1 + - sox==1.3.7 + - tensorboard==2.0.0 + - toml==0.10.0 + - tqdm==4.31.1 + - traitlets==4.3.3 + - typing-extensions==3.7.4.2 + - unidecode==1.1.1 + - urllib3==1.25.8 + - wcwidth==0.1.9 + - werkzeug==1.0.1 + - wrapt==1.10.11 + - zipp==3.1.0 +prefix: /cb/home/daniel/ws/miniconda3/envs/mlperf-rnnt + diff --git a/benchmarks/rnnt/ootb/inference/loadgen/.clang-format b/benchmarks/rnnt/ootb/inference/loadgen/.clang-format new file mode 100644 index 0000000..f08c9c2 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: Google +Standard: Cpp11 diff --git a/benchmarks/rnnt/ootb/inference/loadgen/CMakeLists.txt b/benchmarks/rnnt/ootb/inference/loadgen/CMakeLists.txt new file mode 100644 index 0000000..7865287 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/CMakeLists.txt @@ -0,0 +1,68 @@ +cmake_minimum_required(VERSION 3.1) + +project(mlperf_loadgen) + +# The mlperf_loadgen version. +set(mlperf_loadgen_VERSION_MAJOR 1) +set(mlperf_loadgen_VERSION_MINOR 1) +message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}") + +# Set build options. NB: CXX_STANDARD is supported since CMake 3.1. +if (NOT MSVC) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -W -Wall") +endif() +message(STATUS "Using C++ compiler flags: ${CMAKE_CXX_FLAGS}") +set(CMAKE_CXX_STANDARD "14") +message(STATUS "Using C++ standard: ${CMAKE_CXX_STANDARD}") +message(STATUS "Using static linker flags: ${CMAKE_STATIC_LINKER_FLAGS}") +message(STATUS "Using shared linker flags: ${CMAKE_SHARED_LINKER_FLAGS}") + +# Output directory for libraries. +set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}) +message(STATUS "Using output path: ${LIBRARY_OUTPUT_PATH}") + +# Detect Python to use for generating source file with version info. +# NB: PythonInterp has been deprecated since CMake 3.12 +# but it works with earlier versions of CMake. +find_package(PythonInterp) +message(STATUS "Using Python interpreter: ${PYTHON_EXECUTABLE}") + +# Generate source file with version info. 
+execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/version_generator.py ${CMAKE_BINARY_DIR}/version_generated.cc ${CMAKE_CURRENT_SOURCE_DIR}) + +# Add source files. +set(SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.h + ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.cc + ${CMAKE_CURRENT_SOURCE_DIR}/issue_query_controller.cc + ${CMAKE_CURRENT_SOURCE_DIR}/loadgen.cc + ${CMAKE_CURRENT_SOURCE_DIR}/logging.cc + ${CMAKE_CURRENT_SOURCE_DIR}/logging.h + ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.cc + ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/utils.h + ${CMAKE_CURRENT_SOURCE_DIR}/version.cc + ${CMAKE_CURRENT_SOURCE_DIR}/version.h + ${CMAKE_BINARY_DIR}/version_generated.cc +) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +add_library(mlperf_loadgen STATIC ${SOURCE}) +target_link_libraries(mlperf_loadgen) + +if(WIN32) +set (LIBS "") +else() +set (LIBS pthread) +endif() + +add_executable(benchmark benchmark/repro.cpp) +target_link_libraries(benchmark PUBLIC mlperf_loadgen ${LIBS}) + +# Install library and headers. +install(TARGETS mlperf_loadgen + DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/include FILES_MATCHING PATTERN "*.h") diff --git a/benchmarks/rnnt/ootb/inference/loadgen/README.md b/benchmarks/rnnt/ootb/inference/loadgen/README.md new file mode 100644 index 0000000..e5329a1 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/README.md @@ -0,0 +1,105 @@ +# Overview {#mainpage} + +*Note:* A compiled html version of this document is hosted online +[here](https://mlperf.github.io/inference/loadgen/index.html). + +## Introduction + +* The LoadGen is a *reusable* module that *efficiently* and *fairly* measures + the performance of inference systems. +* It generates traffic for scenarios as formulated by a diverse set of experts + in the [MLPerf working group](https://mlperf.org/about). +* The scenarios emulate the workloads seen in mobile devices, + autonomous vehicles, robotics, and cloud-based setups. +* Although the LoadGen is not model or dataset aware, its strength is in its + reusability with logic that is. + +## Integration Example and Flow +The following is an diagram of how the LoadGen can be integrated into an +inference system, resembling how some of the MLPerf reference models are +implemented. +
+
+*(Diagram not reproduced in this copy; its annotated steps are:)*
+
+  1. Benchmark knows the model, dataset, and preprocessing.
+  2. Benchmark hands dataset sample IDs to LoadGen.
+  3. LoadGen starts generating queries of sample IDs.
+  4. Benchmark creates requests to backend.
+  5. Result is post processed and forwarded to LoadGen.
+  6. LoadGen outputs logs for analysis.
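+
+A minimal, illustrative sketch of this flow using the Python bindings
+(it mirrors the demos under *demos/*; the stub callbacks and settings below are
+placeholders, not a reference implementation):
+
+    import mlperf_loadgen
+
+    # Steps 1-2: the benchmark owns the model/dataset and tells LoadGen how to
+    # (un)load samples and how many samples exist.
+    def load_samples_to_ram(sample_indices):
+        pass
+
+    def unload_samples_from_ram(sample_indices):
+        pass
+
+    # Steps 3-5: LoadGen calls issue_query; the benchmark runs inference and
+    # reports completion for every sample it received.
+    def issue_query(query_samples):
+        responses = [mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)
+                     for s in query_samples]  # real SUTs point data/size at outputs
+        mlperf_loadgen.QuerySamplesComplete(responses)
+
+    def flush_queries():
+        pass
+
+    def process_latencies(latencies_ns):
+        pass
+
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Offline
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries, process_latencies)
+    qsl = mlperf_loadgen.ConstructQSL(1024, 128, load_samples_to_ram,
+                                      unload_samples_from_ram)
+    mlperf_loadgen.StartTest(sut, qsl, settings)  # Step 6: logs are written for analysis
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)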
+
+## Useful Links
+* [FAQ](@ref ReadmeFAQ)
+* [LoadGen Build Instructions](@ref ReadmeBuild)
+* [LoadGen API](@ref LoadgenAPI)
+* [Test Settings](@ref LoadgenAPITestSettings) -
+  A good description of available scenarios, modes, and knobs.
+* [MLPerf Inference Code](https://github.com/mlcommons/inference) -
+  Includes source for the LoadGen and reference models that use the LoadGen.
+* [MLPerf Inference Rules](https://github.com/mlcommons/inference_policies) -
+  Any mismatch with this is a bug in the LoadGen.
+* [MLPerf Website](www.mlperf.org)
+
+## Scope of the LoadGen's Responsibilities
+
+### In Scope
+* **Provide a reusable** C++ library with python bindings.
+* **Implement** the traffic patterns of the MLPerf Inference scenarios and
+  modes.
+* **Record** all traffic generated and received for later analysis and
+  verification.
+* **Summarize** the results and whether performance constraints were met.
+* **Target high-performance** systems with efficient multi-thread friendly
+  logging utilities.
+* **Generate trust** via a shared, well-tested, and community-hardened
+  code base.
+
+### Out of Scope
+The LoadGen is:
+* **NOT** aware of the ML model it is running against.
+* **NOT** aware of the data formats of the model's inputs and outputs.
+* **NOT** aware of how to score the accuracy of a model's outputs.
+* **NOT** aware of MLPerf rules regarding scenario-specific constraints.
+
+Limiting the scope of the LoadGen in this way keeps it reusable across
+different models and datasets without modification. Using composition and
+dependency injection, the user can define their own model, datasets, and
+metrics.
+
+Additionally, not hardcoding MLPerf-specific test constraints, like test
+duration and performance targets, allows users to use the LoadGen unmodified
+for custom testing and continuous integration purposes.
+
+## Submission Considerations
+
+### Upstream all local modifications
+* As a rule, no local modifications to the LoadGen's C++ library are allowed
+for submission.
+* Please upstream early and often to keep the playing field level.
+
+### Choose your TestSettings carefully!
+* Since the LoadGen is oblivious to the model, it can't enforce the MLPerf
+requirements for submission. *e.g.:* target percentiles and latencies.
+* For verification, the values in TestSettings are logged.
+* To help make sure your settings are spec compliant, use
+TestSettings::FromConfig in conjunction with the relevant config file provided
+with the reference models.
+
+## Responsibilities of a LoadGen User
+
+### Implement the Interfaces
+* Implement the SystemUnderTest and QuerySampleLibrary interfaces and pass
+  them to the StartTest function.
+* Call QuerySamplesComplete for every sample received by
+  SystemUnderTest::IssueQuery.
+
+### Assess Accuracy
+* Process the *mlperf_log_accuracy.json* output by the LoadGen to determine
+  the accuracy of your system.
+* For the official models, Python scripts will be provided by the MLPerf model
+  owners for you to do this automatically.
+
+For templates of how to do the above in detail, refer to code for the demos,
+tests, and reference models.
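+
+As a worked example of the *Choose your TestSettings carefully!* advice above,
+a sketch of loading settings from a config file via the Python bindings. The
+file names and the "rnnt" model tag are placeholders, and the argument order
+assumed here is FromConfig(config_path, model, scenario):
+
+    import mlperf_loadgen
+
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Server
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    # Hypothetical paths; use the config shipped with the reference model.
+    settings.FromConfig("mlperf.conf", "rnnt", "Server")
+    settings.FromConfig("user.conf", "rnnt", "Server")
+
+    log_settings = mlperf_loadgen.LogSettings()
+    log_settings.log_output.outdir = "results"
+    log_settings.log_output.copy_summary_to_stdout = True
+
+    # sut and qsl come from ConstructSUT/ConstructQSL as in the earlier sketch.
+    mlperf_loadgen.StartTestWithLogSettings(sut, qsl, settings, log_settings)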
diff --git a/benchmarks/rnnt/ootb/inference/loadgen/README_BUILD.md b/benchmarks/rnnt/ootb/inference/loadgen/README_BUILD.md new file mode 100644 index 0000000..095a8d8 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/README_BUILD.md @@ -0,0 +1,32 @@ +# Building the LoadGen {#ReadmeBuild} + +## Prerequisites + + sudo apt-get install libglib2.0-dev python-pip python3-pip + pip2 install absl-py numpy + pip3 install absl-py numpy + +## Quick Start + + pip install absl-py numpy + git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference + cd mlperf_inference/loadgen + CFLAGS="-std=c++14 -O3" python setup.py bdist_wheel + pip install --force-reinstall dist/mlperf_loadgen-0.5a0-cp36-cp36m-linux_x86_64.whl + python demos/py_demo_single_stream.py + +This will fetch the loadgen source, build and install the loadgen as a python module, and run a simple end-to-end demo. The exact *.whl filename may differ on your system, but there should only be one resulting whl file for you to use. + +A summary of the test results can be found in the *"mlperf_log_summary.txt"* logfile. + +For a timeline visualization of what happened during the test, open the *"mlperf_log_trace.json"* file in Chrome: +* Type “chrome://tracing” in the address bar, then drag-n-drop the json. +* This may be useful for SUT performance tuning and understanding + debugging the loadgen. + +To build the loadgen as a C++ library, rather than a python module: + + git clone https://github.com/mlcommons/inference.git mlperf_inference + cd mlperf_inference + mkdir loadgen/build/ && cd loadgen/build/ + cmake .. && cmake --build . + cp libmlperf_loadgen.a .. diff --git a/benchmarks/rnnt/ootb/inference/loadgen/README_FAQ.md b/benchmarks/rnnt/ootb/inference/loadgen/README_FAQ.md new file mode 100644 index 0000000..c1093a6 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/README_FAQ.md @@ -0,0 +1,88 @@ +# LoadGen FAQ {#ReadmeFAQ} + +## Q: The LoadGen does not match the MLPerf specification. Who is right? +**A:** +The MLPerf spec is *always* right. +Please file a LoadGen bug so it may be resolved. + +## Q: How can I file a bug? +**A:** +On GitHub: https://github.com/mlcommons/inference/issues/new + +## Q: Can I make local modifications to the LoadGen for submission? +**A:** +No. To keep the playing field level, please upstream any local +modificiations you need to make. Ideally upstream such changes behind a runtime +flag or via an abstract interface the client can implement. This will help +with testability. + +## Q: Where can I find the results of a test? +**A:** +By default, the loadgen will output an *mlperf_log_summary.txt* file +that summarizes the target metrics and constraints of the test, along with +other stats about the run. + +*Note:* LogSettings also has a flag to forward the results to stdout and +there's an outstanding TODO to make this more programmable. + +## Q: The reference implementation for \<*some_model*\> prints out results of its own. Are those for submission? +**A:** +They are not. The LoadGen results are the ground truth for submission +results since they will work even for systems that forgo the python bindings. +If you notice a bug in the LoadGen's results, please file a bug or submit a +patch. + +## Q: I'm getting linker errors for LoadgenVersion definitions. Where is *version_generated.cc*? +**A:** +If you have a custom build setup, make sure you run the *version_generator.py* +script, which will create the cc file you are looking for. 
The official build +files that come with the LoadGen do this for you out of the box. + +## Q: What is this *version_generator.py* script? +**A:** +The LoadGen records git stats (if available) and the SHA1 of all its +source files (always) at build time for verification purposes. This is easy +to circumvent, but try your best to run *version_generator.py* correctly; +ideally integrated with your build system if you have a custom build. +The intention is more to help with debugging efforts and detect accidental +version missmatches than to detect bad actors. + +## Q: How do I view the *mlperf_log_trace.json* file? +**A:** +This file uses the [Trace Event Format] +(https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit) +to record a timeline of all the threads involved. +You can view the file by typing [chrome://tracing](chrome://tracing) into +Chrome's address bar and dragging the json file there. +This file zips well and you can drag the zip file directly into +[chrome://tracing](chrome://tracing) too. +Please include zipped traces (and the other logs) when filing bug reports. + +## Q: What is the difference between the MultiStream and MultiStreamFree scenarios? +**A:** +MultiStream corresponds to the official MLPerf scenario for submissions; +it has a fixed query rate and allows only one outstanding query at a time. +MultiStreamFree is implemented for evaluation purposes only; it sends queries +as fast as possible and allows up to N outstanding queries at a time. You may +want to use MultiStreamFree for development purposes since small improvements +in performance will always be reflected in the results, whereas MultiStream's +results will be quantized. + +## Q: Why is the code littered with so many lambdas? My eyes hurt. +**A:** +Lambdas are a convenient and efficient way to ship arbitrary data + deferred +logic over to the logging thread without much boilerplate. +Much of the loadgen is built on top of the logging utilities. +Thus the lambdas. (Sorry about the eyes.) + +## Q: What C++ version does the LoadGen target? +**A:** +It currently targets and requires C++14. It should compile with recent +versions of clang, gcc, and msvc. + +## Q: What dependencies does the LoadGen code have? +**A:** +The C++ code has no external dependencies. The loadgen itself, logging +utilities, and unit test utilities are built solely on the C++ Standard Library. +The python bindings, however, do require +[pybind11](https://github.com/pybind/pybind11). diff --git a/benchmarks/rnnt/ootb/inference/loadgen/benchmark/.gitignore b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/.gitignore new file mode 100644 index 0000000..e792c8e --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/.gitignore @@ -0,0 +1,2 @@ +loadgen_build +build \ No newline at end of file diff --git a/benchmarks/rnnt/ootb/inference/loadgen/benchmark/README.md b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/README.md new file mode 100644 index 0000000..24e8729 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/README.md @@ -0,0 +1,10 @@ +Note: please install jemalloc first. 
See: http://jemalloc.net/ +Command: bash run.sh <0=Basic,1=Queue> + +Experiments: +- On Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz +- Basic SUT : 500-600k i/s +- Basic SUT + jemalloc: 800-900k i/s (`bash run.sh 800000 0`) +- Queued SUT (2 complete threads) + jemalloc: 1.2-1.3M i/s (`bash run.sh 1200000 1 2 2048`) +- Queued SUT (2 complete threads) + jemalloc + server_coalesce_queries: 1.4-1.5M is/ (`bash run.sh 1400000 1 2 512 1`) +- Basic SUT + jemalloc + server_coalesce_queries + 4 IssueQueryThreads: 2.4-2.5M is/ (`bash run.sh 2400000 0 2 512 1 4`) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/benchmark/repro.cpp b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/repro.cpp new file mode 100644 index 0000000..8b4bc8a --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/repro.cpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "loadgen.h" +#include "query_sample_library.h" +#include "system_under_test.h" +#include "test_settings.h" + +class QSL : public mlperf::QuerySampleLibrary { + public: + ~QSL() override{}; + const std::string& Name() const override { return mName; } + size_t TotalSampleCount() override { return 1000000; } + size_t PerformanceSampleCount() override { return TotalSampleCount(); } + void LoadSamplesToRam( + const std::vector& samples) override {} + void UnloadSamplesFromRam( + const std::vector& samples) override {} + + private: + std::string mName{"Dummy QSL"}; +}; + +class BasicSUT : public mlperf::SystemUnderTest { + public: + BasicSUT() { + // Start with some large value so that we don't reallocate memory. + initResponse(10000); + } + ~BasicSUT() override {} + const std::string& Name() const override { return mName; } + void IssueQuery(const std::vector& samples) override { + int n = samples.size(); + if (n > mResponses.size()) { + std::cerr << "Warning: reallocating response buffer in BasicSUT. Maybe " + "you should initResponse with larger value!?" + << std::endl; + initResponse(samples.size()); + } + for (int i = 0; i < n; i++) { + mResponses[i].id = samples[i].id; + } + mlperf::QuerySamplesComplete(mResponses.data(), n); + } + void FlushQueries() override {} + void ReportLatencyResults( + const std::vector& latencies_ns) override{}; + + private: + void initResponse(int size) { + mResponses.resize(size, + {0, reinterpret_cast(&mBuf), sizeof(int)}); + } + int mBuf{0}; + std::string mName{"BasicSUT"}; + std::vector mResponses; +}; + +class QueueSUT : public mlperf::SystemUnderTest { + public: + QueueSUT(int numCompleteThreads, int maxSize) { + // Each thread handle at most maxSize at a time. 
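+    // Design note: IssueQuery only appends sample ids to mIdQueue under mMtx
+    // and notifies one worker; each of the numCompleteThreads workers then
+    // drains up to maxSize ids into its own preallocated response buffer and
+    // calls mlperf::QuerySamplesComplete outside the lock.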
+ std::cout << "QueueSUT: maxSize = " << maxSize << std::endl; + initResponse(numCompleteThreads, maxSize); + // Launch complete threads + for (int i = 0; i < numCompleteThreads; i++) { + mThreads.emplace_back(&QueueSUT::CompleteThread, this, i); + } + } + ~QueueSUT() override { + { + std::unique_lock lck(mMtx); + mDone = true; + mCondVar.notify_all(); + } + for (auto& thread : mThreads) { + thread.join(); + } + } + const std::string& Name() const override { return mName; } + void IssueQuery(const std::vector& samples) override { + std::unique_lock lck(mMtx); + for (const auto& sample : samples) { + mIdQueue.push_back(sample.id); + } + // Let some worker thread to consume tasks + mCondVar.notify_one(); + } + void FlushQueries() override {} + void ReportLatencyResults( + const std::vector& latencies_ns) override{}; + + private: + void CompleteThread(int threadIdx) { + auto& responses = mResponses[threadIdx]; + size_t maxSize{responses.size()}; + size_t actualSize{0}; + while (true) { + { + std::unique_lock lck(mMtx); + mCondVar.wait(lck, [&]() { return !mIdQueue.empty() || mDone; }); + + if (mDone) { + break; + } + + actualSize = std::min(maxSize, mIdQueue.size()); + for (int i = 0; i < actualSize; i++) { + responses[i].id = mIdQueue.front(); + mIdQueue.pop_front(); + } + mCondVar.notify_one(); + } + mlperf::QuerySamplesComplete(responses.data(), actualSize); + } + } + void initResponse(int numCompleteThreads, int size) { + mResponses.resize(numCompleteThreads); + for (auto& responses : mResponses) { + responses.resize(size, + {0, reinterpret_cast(&mBuf), sizeof(int)}); + } + } + int mBuf{0}; + std::string mName{"QueueSUT"}; + std::vector> mResponses; + std::vector mThreads; + std::deque mIdQueue; + std::mutex mMtx; + std::condition_variable mCondVar; + bool mDone{false}; +}; + +class MultiBasicSUT : public mlperf::SystemUnderTest { + public: + MultiBasicSUT(int numThreads) + : mNumThreads(numThreads), mResponses(numThreads) { + // Start with some large value so that we don't reallocate memory. + initResponse(10000); + for (int i = 0; i < mNumThreads; ++i) { + mThreads.emplace_back(&MultiBasicSUT::startIssueThread, this, i); + } + } + ~MultiBasicSUT() override { + for (auto& thread : mThreads) { + thread.join(); + } + } + const std::string& Name() const override { return mName; } + void IssueQuery(const std::vector& samples) override { + int thread_idx = mThreadMap[std::this_thread::get_id()]; + int n = samples.size(); + auto& reponses = mResponses[thread_idx]; + if (n > reponses.size()) { + std::cout + << "Warning: reallocating response buffer in MultiBasicSUT. Maybe " + "you should initResponse with larger value!?" 
+ << std::endl; + initResponse(samples.size()); + } + for (int i = 0; i < n; i++) { + reponses[i].id = samples[i].id; + } + mlperf::QuerySamplesComplete(reponses.data(), n); + } + void FlushQueries() override {} + void ReportLatencyResults( + const std::vector& latencies_ns) override{}; + + private: + void initResponse(int size) { + for (auto& responses : mResponses) { + responses.resize(size, + {0, reinterpret_cast(&mBuf), sizeof(int)}); + } + } + void startIssueThread(int thread_idx) { + { + std::lock_guard lock(mMtx); + mThreadMap[std::this_thread::get_id()] = thread_idx; + } + mlperf::RegisterIssueQueryThread(); + } + int mBuf{0}; + int mNumThreads{0}; + std::string mName{"MultiBasicSUT"}; + std::vector> mResponses; + std::mutex mMtx; + std::vector mThreads; + std::map mThreadMap; +}; + +int main(int argc, char** argv) { + assert(argc >= 2 && "Need to pass in at least one argument: target_qps"); + int target_qps = std::stoi(argv[1]); + std::cout << "target_qps = " << target_qps << std::endl; + + bool useQueue{false}; + int numCompleteThreads{4}; + int maxSize{1}; + bool server_coalesce_queries{false}; + int num_issue_threads{0}; + if (argc >= 3) { + useQueue = std::stoi(argv[2]) != 0; + } + if (argc >= 4) { + numCompleteThreads = std::stoi(argv[3]); + } + if (argc >= 5) { + maxSize = std::stoi(argv[4]); + } + if (argc >= 6) { + server_coalesce_queries = std::stoi(argv[5]) != 0; + } + if (argc >= 7) { + num_issue_threads = std::stoi(argv[6]); + } + + QSL qsl; + std::unique_ptr sut; + + // Configure the test settings + mlperf::TestSettings testSettings; + testSettings.scenario = mlperf::TestScenario::Server; + testSettings.mode = mlperf::TestMode::PerformanceOnly; + testSettings.server_target_qps = target_qps; + testSettings.server_target_latency_ns = 10000000; // 10ms + testSettings.server_target_latency_percentile = 0.99; + testSettings.min_duration_ms = 60000; + testSettings.min_query_count = 270000; + testSettings.server_coalesce_queries = server_coalesce_queries; + std::cout << "testSettings.server_coalesce_queries = " + << (server_coalesce_queries ? "True" : "False") << std::endl; + testSettings.server_num_issue_query_threads = num_issue_threads; + std::cout << "num_issue_threads = " << num_issue_threads << std::endl; + + // Configure the logging settings + mlperf::LogSettings logSettings; + logSettings.log_output.outdir = "build"; + logSettings.log_output.prefix = "mlperf_log_"; + logSettings.log_output.suffix = ""; + logSettings.log_output.prefix_with_datetime = false; + logSettings.log_output.copy_detail_to_stdout = false; + logSettings.log_output.copy_summary_to_stdout = true; + logSettings.log_mode = mlperf::LoggingMode::AsyncPoll; + logSettings.log_mode_async_poll_interval_ms = 1000; + logSettings.enable_trace = false; + + // Choose SUT + if (num_issue_threads == 0) { + if (useQueue) { + std::cout << "Using QueueSUT with " << numCompleteThreads + << " complete threads" << std::endl; + sut.reset(new QueueSUT(numCompleteThreads, maxSize)); + } else { + std::cout << "Using BasicSUT" << std::endl; + sut.reset(new BasicSUT()); + } + } else { + if (useQueue) { + std::cout << "Using MultiQueueSUT with " << numCompleteThreads + << " complete threads" << std::endl; + std::cerr << "!!!! MultiQueueSUT is NOT implemented yet !!!!" 
+ << std::endl; + return 1; + // sut.reset(new MultiQueueSUT(num_issue_threads, numCompleteThreads, + // maxSize)); + } else { + std::cout << "Using MultiBasicSUT" << std::endl; + sut.reset(new MultiBasicSUT(num_issue_threads)); + } + } + + // Start test + std::cout << "Start test..." << std::endl; + mlperf::StartTest(sut.get(), &qsl, testSettings, logSettings); + std::cout << "Test done. Clean up SUT..." << std::endl; + sut.reset(); + std::cout << "Done!" << std::endl; + return 0; +} diff --git a/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run.sh b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run.sh new file mode 100644 index 0000000..62559c1 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run.sh @@ -0,0 +1,21 @@ +#!/usr/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +echo "Building loadgen..." +if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi; +cd loadgen_build && cmake ../.. && make -j && cd .. +echo "Building test program..." +if [ ! -e build ]; then mkdir build; fi; +g++ --std=c++11 -O3 -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \ +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 build/repro.exe $1 $2 $3 $4 $5 $6 diff --git a/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run_debug.sh b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run_debug.sh new file mode 100644 index 0000000..ba63727 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run_debug.sh @@ -0,0 +1,21 @@ +#!/usr/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +echo "Building loadgen in Debug mode..." +if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi; +cd loadgen_build && cmake -DCMAKE_BUILD_TYPE=Debug ../.. && make -j && cd .. +echo "Building test program in Debug mode..." +if [ ! -e build ]; then mkdir build; fi; +g++ --std=c++11 -O0 -g -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \ +gdb --args build/repro.exe $1 $2 $3 $4 $5 $6 diff --git a/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.cc b/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.cc new file mode 100644 index 0000000..9de41da --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.cc @@ -0,0 +1,168 @@ +/* Copyright 2019 The MLPerf Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "c_api.h" + +#include + +#include "../loadgen.h" +#include "../query_sample.h" +#include "../query_sample_library.h" +#include "../system_under_test.h" +#include "../test_settings.h" + +namespace mlperf { +namespace c { +namespace { + +// Forwards SystemUnderTest calls to relevant callbacks. +class SystemUnderTestTrampoline : public SystemUnderTest { + public: + SystemUnderTestTrampoline( + ClientData client_data, std::string name, IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) + : client_data_(client_data), + name_(std::move(name)), + issue_cb_(issue_cb), + flush_queries_cb_(flush_queries_cb), + report_latency_results_cb_(report_latency_results_cb) {} + ~SystemUnderTestTrampoline() override = default; + + const std::string& Name() const override { return name_; } + + void IssueQuery(const std::vector& samples) override { + (*issue_cb_)(client_data_, samples.data(), samples.size()); + } + + void FlushQueries() override { (*flush_queries_cb_)(); } + + void ReportLatencyResults( + const std::vector& latencies_ns) override { + (*report_latency_results_cb_)(client_data_, latencies_ns.data(), + latencies_ns.size()); + } + + private: + ClientData client_data_; + std::string name_; + IssueQueryCallback issue_cb_; + FlushQueriesCallback flush_queries_cb_; + ReportLatencyResultsCallback report_latency_results_cb_; +}; + +} // namespace + +void* ConstructSUT(ClientData client_data, const char* name, size_t name_length, + IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) { + SystemUnderTestTrampoline* sut = new SystemUnderTestTrampoline( + client_data, std::string(name, name_length), issue_cb, flush_queries_cb, + report_latency_results_cb); + return reinterpret_cast(sut); +} + +void DestroySUT(void* sut) { + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + delete sut_cast; +} + +namespace { + +// Forwards QuerySampleLibrary calls to relevant callbacks. 
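+// As with the SUT trampoline above, the only client state carried here is the
+// opaque ClientData handle, which is passed back on every callback so the C
+// caller can recover its own context without globals.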
+class QuerySampleLibraryTrampoline : public QuerySampleLibrary { + public: + QuerySampleLibraryTrampoline( + ClientData client_data, std::string name, size_t total_sample_count, + size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) + : client_data_(client_data), + name_(std::move(name)), + total_sample_count_(total_sample_count), + performance_sample_count_(performance_sample_count), + load_samples_to_ram_cb_(load_samples_to_ram_cb), + unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {} + ~QuerySampleLibraryTrampoline() override = default; + + const std::string& Name() const override { return name_; } + size_t TotalSampleCount() override { return total_sample_count_; } + size_t PerformanceSampleCount() override { return performance_sample_count_; } + + void LoadSamplesToRam(const std::vector& samples) override { + (*load_samples_to_ram_cb_)(client_data_, samples.data(), samples.size()); + } + void UnloadSamplesFromRam( + const std::vector& samples) override { + (*unload_samples_from_ram_cb_)(client_data_, samples.data(), + samples.size()); + } + + private: + ClientData client_data_; + std::string name_; + size_t total_sample_count_; + size_t performance_sample_count_; + LoadSamplesToRamCallback load_samples_to_ram_cb_; + UnloadSamplesFromRamCallback unload_samples_from_ram_cb_; +}; + +} // namespace + +void* ConstructQSL(ClientData client_data, const char* name, size_t name_length, + size_t total_sample_count, size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) { + QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline( + client_data, std::string(name, name_length), total_sample_count, + performance_sample_count, load_samples_to_ram_cb, + unload_samples_from_ram_cb); + return reinterpret_cast(qsl); +} + +void DestroyQSL(void* qsl) { + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + delete qsl_cast; +} + +// mlperf::c::StartTest just forwards to mlperf::StartTest after doing the +// proper cast. +void StartTest(void* sut, void* qsl, const TestSettings& settings) { + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + LogSettings default_log_settings; + mlperf::StartTest(sut_cast, qsl_cast, settings, default_log_settings); +} + +void QuerySamplesComplete(QuerySampleResponse* responses, + size_t response_count) { + mlperf::QuerySamplesComplete(responses, response_count); +} + +void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses, + size_t response_count, ResponseCallback response_cb, + ClientData client_data) { + mlperf::QuerySamplesComplete(responses, response_count, + [client_data, response_cb] (QuerySampleResponse* response) { + response_cb(client_data, response); + }); +} + +void RegisterIssueQueryThread() { mlperf::RegisterIssueQueryThread(); } + +} // namespace c +} // namespace mlperf diff --git a/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.h b/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.h new file mode 100644 index 0000000..cf1a859 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.h @@ -0,0 +1,90 @@ +/* Copyright 2019 The MLPerf Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/// \file +/// \brief A C API wrapping the C++ loadgen. Not tested. Needs work. +/// \details The C API allows a C or Python client to easily create +/// a SystemUnderTest without having to expose the SystemUnderTest class +/// directly. +/// ConstructSUT works with a bunch of function poitners instead that are +/// called from an underlying trampoline class. + +#ifndef SYSTEM_UNDER_TEST_C_API_H_ +#define SYSTEM_UNDER_TEST_C_API_H_ + +#include +#include + +#include "../query_sample.h" +#include "../test_settings.h" + +namespace mlperf { + +namespace c { + +/// \brief Optional opaque client data that creators of SUTs and QSLs can have +/// the loadgen pass back to their callback invocations. +/// Helps avoids global variables. +typedef uintptr_t ClientData; + +typedef void (*IssueQueryCallback)(ClientData, const QuerySample*, size_t); +typedef void (*FlushQueriesCallback)(); +typedef void (*ReportLatencyResultsCallback)(ClientData, const int64_t*, + size_t); +typedef void (*ResponseCallback)(ClientData, QuerySampleResponse*); + +/// \brief SUT calls this function to report query result back to loadgen +void QuerySamplesComplete(QuerySampleResponse* responses, + size_t response_count); + +void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses, + size_t response_count, + ResponseCallback response_cb, + ClientData client_data); + +/// \brief Create an opaque SUT pointer based on C callbacks. +void* ConstructSUT(ClientData client_data, const char* name, size_t name_length, + IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb); +/// \brief Destroys the SUT created by ConstructSUT. +void DestroySUT(void* sut); + +typedef void (*LoadSamplesToRamCallback)(ClientData, const QuerySampleIndex*, + size_t); +typedef void (*UnloadSamplesFromRamCallback)(ClientData, + const QuerySampleIndex*, size_t); + +/// \brief Create an opaque QSL pointer based on C callbacks. +void* ConstructQSL(ClientData client_data, const char* name, size_t name_length, + size_t total_sample_count, size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb); +/// \brief Destroys the QSL created by ConsructQSL. +void DestroyQSL(void* qsl); + +/// \brief Run tests on a SUT created by ConstructSUT(). +/// \details This is the C entry point. See mlperf::StartTest for the C++ entry +/// point. +void StartTest(void* sut, void* qsl, const TestSettings& settings); + +/// +/// \brief Register a thread for query issuing in Server scenario. +/// \details This is the C entry point. See mlperf::RegisterIssueQueryThread for the C++ entry +/// point. 
+/// +void RegisterIssueQueryThread(); + +} // namespace c +} // namespace mlperf + +#endif // SYSTEM_UNDER_TEST_C_API_H_ diff --git a/benchmarks/rnnt/ootb/inference/loadgen/bindings/python_api.cc b/benchmarks/rnnt/ootb/inference/loadgen/bindings/python_api.cc new file mode 100644 index 0000000..140604e --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/bindings/python_api.cc @@ -0,0 +1,397 @@ +/* Copyright 2019 The MLPerf Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/// \file +/// \brief Python bindings for the loadgen using pybind11. + +#ifndef PYTHON_BINDINGS_H +#define PYTHON_BINDINGS_H + +#include + +#include "../loadgen.h" +#include "../query_sample.h" +#include "../query_sample_library.h" +#include "../system_under_test.h" +#include "../test_settings.h" +#include "pybind11/functional.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "pybind11/stl_bind.h" + +namespace mlperf { + +namespace { + +using IssueQueryCallback = std::function)>; +using FastIssueQueriesCallback = + std::function, std::vector)>; +using FlushQueriesCallback = std::function; +using ReportLatencyResultsCallback = std::function)>; + +// Forwards SystemUnderTest calls to relevant callbacks. 
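+// Note: py::StartTest below releases the GIL and the loadgen issues queries
+// from its own threads, so the query-issuing and latency-reporting callbacks
+// re-acquire it (pybind11::gil_scoped_acquire) before calling into Python.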
+class SystemUnderTestTrampoline : public SystemUnderTest { + public: + SystemUnderTestTrampoline( + std::string name, IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) + : name_(std::move(name)), + issue_cb_(issue_cb), + flush_queries_cb_(flush_queries_cb), + report_latency_results_cb_(report_latency_results_cb) {} + ~SystemUnderTestTrampoline() override = default; + + const std::string& Name() const override { return name_; } + + void IssueQuery(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + issue_cb_(samples); + } + + void FlushQueries() override { flush_queries_cb_(); } + + void ReportLatencyResults( + const std::vector& latencies_ns) override { + pybind11::gil_scoped_acquire gil_acquirer; + report_latency_results_cb_(latencies_ns); + } + + protected: + std::string name_; + IssueQueryCallback issue_cb_; + FlushQueriesCallback flush_queries_cb_; + ReportLatencyResultsCallback report_latency_results_cb_; +}; + +class FastSystemUnderTestTrampoline : public SystemUnderTestTrampoline { + public: + FastSystemUnderTestTrampoline( + std::string name, FastIssueQueriesCallback fast_issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) + : SystemUnderTestTrampoline(name, nullptr, flush_queries_cb, + report_latency_results_cb), + fast_issue_cb_(fast_issue_cb) {} + ~FastSystemUnderTestTrampoline() override = default; + + void IssueQuery(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + std::vector responseIds; + std::vector querySampleIndices; + for (auto& s : samples) { + responseIds.push_back(s.id); + querySampleIndices.push_back(s.index); + } + fast_issue_cb_(responseIds, querySampleIndices); + } + + private: + FastIssueQueriesCallback fast_issue_cb_; +}; + +using LoadSamplesToRamCallback = + std::function)>; +using UnloadSamplesFromRamCallback = + std::function)>; + +// Forwards QuerySampleLibrary calls to relevant callbacks. +class QuerySampleLibraryTrampoline : public QuerySampleLibrary { + public: + QuerySampleLibraryTrampoline( + std::string name, size_t total_sample_count, + size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) + : name_(std::move(name)), + total_sample_count_(total_sample_count), + performance_sample_count_(performance_sample_count), + load_samples_to_ram_cb_(load_samples_to_ram_cb), + unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {} + ~QuerySampleLibraryTrampoline() override = default; + + const std::string& Name() const override { return name_; } + size_t TotalSampleCount() { return total_sample_count_; } + size_t PerformanceSampleCount() { return performance_sample_count_; } + + void LoadSamplesToRam(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + load_samples_to_ram_cb_(samples); + } + void UnloadSamplesFromRam( + const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + unload_samples_from_ram_cb_(samples); + } + + private: + std::string name_; + size_t total_sample_count_; + size_t performance_sample_count_; + LoadSamplesToRamCallback load_samples_to_ram_cb_; + UnloadSamplesFromRamCallback unload_samples_from_ram_cb_; +}; + +} // namespace + +/// \brief Python bindings. 
+namespace py { + +uintptr_t ConstructSUT(IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) { + SystemUnderTestTrampoline* sut = new SystemUnderTestTrampoline( + "PySUT", issue_cb, flush_queries_cb, report_latency_results_cb); + return reinterpret_cast(sut); +} + +void DestroySUT(uintptr_t sut) { + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + delete sut_cast; +} + +uintptr_t ConstructFastSUT( + FastIssueQueriesCallback fast_issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) { + FastSystemUnderTestTrampoline* sut = new FastSystemUnderTestTrampoline( + "PyFastSUT", fast_issue_cb, flush_queries_cb, report_latency_results_cb); + return reinterpret_cast(sut); +} + +void DestroyFastSUT(uintptr_t sut) { + FastSystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + delete sut_cast; +} + + +uintptr_t ConstructQSL( + size_t total_sample_count, size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) { + QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline( + "PyQSL", total_sample_count, performance_sample_count, + load_samples_to_ram_cb, unload_samples_from_ram_cb); + return reinterpret_cast(qsl); +} + +void DestroyQSL(uintptr_t qsl) { + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + delete qsl_cast; +} + +void StartTest(uintptr_t sut, uintptr_t qsl, + mlperf::TestSettings test_settings) { + pybind11::gil_scoped_release gil_releaser; + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + LogSettings default_log_settings; + mlperf::StartTest(sut_cast, qsl_cast, test_settings, default_log_settings); +} + +void StartTestWithLogSettings(uintptr_t sut, uintptr_t qsl, + mlperf::TestSettings test_settings, + mlperf::LogSettings log_settings) { + pybind11::gil_scoped_release gil_releaser; + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + mlperf::StartTest(sut_cast, qsl_cast, test_settings, log_settings); +} + +using ResponseCallback = std::function; + +/// TODO: Get rid of copies. 
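+/// (The responses vector is currently received from Python by value, so each
+/// call copies the QuerySampleResponse objects before forwarding them to
+/// mlperf::QuerySamplesComplete; hence the TODO.)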
+void QuerySamplesComplete(std::vector responses, ResponseCallback response_cb = {}) { + pybind11::gil_scoped_release gil_releaser; + mlperf::QuerySamplesComplete(responses.data(), responses.size(), response_cb); +} + +PYBIND11_MODULE(mlperf_loadgen, m) { + m.doc() = "MLPerf Inference load generator."; + + pybind11::enum_(m, "TestScenario") + .value("SingleStream", TestScenario::SingleStream) + .value("MultiStream", TestScenario::MultiStream) + .value("MultiStreamFree", TestScenario::MultiStreamFree) + .value("Server", TestScenario::Server) + .value("Offline", TestScenario::Offline); + + pybind11::enum_(m, "TestMode") + .value("SubmissionRun", TestMode::SubmissionRun) + .value("AccuracyOnly", TestMode::AccuracyOnly) + .value("PerformanceOnly", TestMode::PerformanceOnly) + .value("FindPeakPerformance", TestMode::FindPeakPerformance); + + pybind11::class_(m, "TestSettings") + .def(pybind11::init<>()) + .def_readwrite("scenario", &TestSettings::scenario) + .def_readwrite("mode", &TestSettings::mode) + .def_readwrite("single_stream_expected_latency_ns", + &TestSettings::single_stream_expected_latency_ns) + .def_readwrite("single_stream_target_latency_percentile", + &TestSettings::single_stream_target_latency_percentile) + .def_readwrite("multi_stream_target_qps", + &TestSettings::multi_stream_target_qps) + .def_readwrite("multi_stream_target_latency_ns", + &TestSettings::multi_stream_target_latency_ns) + .def_readwrite("multi_stream_target_latency_percentile", + &TestSettings::multi_stream_target_latency_percentile) + .def_readwrite("multi_stream_samples_per_query", + &TestSettings::multi_stream_samples_per_query) + .def_readwrite("multi_stream_max_async_queries", + &TestSettings::multi_stream_max_async_queries) + .def_readwrite("server_target_qps", &TestSettings::server_target_qps) + .def_readwrite("server_target_latency_ns", + &TestSettings::server_target_latency_ns) + .def_readwrite("server_target_latency_percentile", + &TestSettings::server_target_latency_percentile) + .def_readwrite("server_coalesce_queries", + &TestSettings::server_coalesce_queries) + .def_readwrite("server_find_peak_qps_decimals_of_precision", + &TestSettings::server_find_peak_qps_decimals_of_precision) + .def_readwrite("server_find_peak_qps_boundary_step_size", + &TestSettings::server_find_peak_qps_boundary_step_size) + .def_readwrite("server_max_async_queries", + &TestSettings::server_max_async_queries) + .def_readwrite("offline_expected_qps", + &TestSettings::offline_expected_qps) + .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms) + .def_readwrite("max_duration_ms", &TestSettings::max_duration_ms) + .def_readwrite("min_query_count", &TestSettings::min_query_count) + .def_readwrite("max_query_count", &TestSettings::max_query_count) + .def_readwrite("qsl_rng_seed", &TestSettings::qsl_rng_seed) + .def_readwrite("sample_index_rng_seed", + &TestSettings::sample_index_rng_seed) + .def_readwrite("schedule_rng_seed", &TestSettings::schedule_rng_seed) + .def_readwrite("accuracy_log_rng_seed", + &TestSettings::accuracy_log_rng_seed) + .def_readwrite("accuracy_log_probability", + &TestSettings::accuracy_log_probability) + .def_readwrite("print_timestamps", &TestSettings::print_timestamps) + .def_readwrite("performance_issue_unique", + &TestSettings::performance_issue_unique) + .def_readwrite("performance_issue_same", + &TestSettings::performance_issue_same) + .def_readwrite("performance_issue_same_index", + &TestSettings::performance_issue_same_index) + .def_readwrite("performance_sample_count_override", + 
&TestSettings::performance_sample_count_override) + .def("FromConfig", &TestSettings::FromConfig, "FromConfig."); + + pybind11::enum_(m, "LoggingMode") + .value("AsyncPoll", LoggingMode::AsyncPoll) + .value("EndOfTestOnly", LoggingMode::EndOfTestOnly) + .value("Synchronous", LoggingMode::Synchronous); + + pybind11::class_(m, "LogOutputSettings") + .def(pybind11::init<>()) + .def_readwrite("outdir", &LogOutputSettings::outdir) + .def_readwrite("prefix", &LogOutputSettings::prefix) + .def_readwrite("suffix", &LogOutputSettings::suffix) + .def_readwrite("prefix_with_datetime", + &LogOutputSettings::prefix_with_datetime) + .def_readwrite("copy_detail_to_stdout", + &LogOutputSettings::copy_detail_to_stdout) + .def_readwrite("copy_summary_to_stdout", + &LogOutputSettings::copy_summary_to_stdout); + + pybind11::class_(m, "LogSettings") + .def(pybind11::init<>()) + .def_readwrite("log_output", &LogSettings::log_output) + .def_readwrite("log_mode", &LogSettings::log_mode) + .def_readwrite("log_mode_async_poll_interval_ms", + &LogSettings::log_mode_async_poll_interval_ms) + .def_readwrite("enable_trace", &LogSettings::enable_trace); + + pybind11::class_(m, "QuerySample") + .def(pybind11::init<>()) + .def(pybind11::init()) + .def_readwrite("id", &QuerySample::id) + .def_readwrite("index", &QuerySample::index) + .def(pybind11::pickle( + [] (const QuerySample &qs) { // __getstate__ + /*Return a tuple that fully encodes state of object*/ + return pybind11::make_tuple(qs.id, qs.index); + }, + [] (pybind11::tuple t) { // __setstate__ + if (t.size() != 2) + throw std::runtime_error("Invalid state for QuerySample"); + /* Create a new C++ instance*/ + QuerySample q; + q.id = t[0].cast(); + q.index = t[1].cast(); + return q; + })); + + pybind11::class_(m, "QuerySampleResponse") + .def(pybind11::init<>()) + .def(pybind11::init()) + .def_readwrite("id", &QuerySampleResponse::id) + .def_readwrite("data", &QuerySampleResponse::data) + .def_readwrite("size", &QuerySampleResponse::size) + .def(pybind11::pickle( + [] (const QuerySampleResponse &qsr) { // __getstate__ + /* Return a tuple that fully encodes state of object*/ + return pybind11::make_tuple(qsr.id, qsr.data, qsr.size); + }, + [] (pybind11::tuple t) { // __setstate__ + if (t.size() != 3) + throw std::runtime_error("Invalid state for QuerySampleResponse"); + /* Create a new C++ instance*/ + QuerySampleResponse q; + q.id = t[0].cast(); + q.data = t[1].cast(); + q.size = t[2].cast(); + return q; + })); + + // TODO: Use PYBIND11_MAKE_OPAQUE for the following vector types. + pybind11::bind_vector>(m, "VectorQuerySample"); + pybind11::bind_vector>( + m, "VectorQuerySampleResponse"); + + m.def("ConstructSUT", &py::ConstructSUT, "Construct the system under test."); + m.def("DestroySUT", &py::DestroySUT, + "Destroy the object created by ConstructSUT."); + + m.def("ConstructFastSUT", &py::ConstructFastSUT, + "Construct the system under test, fast issue query"); + m.def("DestroyFastSUT", &py::DestroyFastSUT, + "Destroy the object created by ConstructFastSUT."); + + m.def("ConstructQSL", &py::ConstructQSL, + "Construct the query sample library."); + m.def("DestroyQSL", &py::DestroyQSL, + "Destroy the object created by ConstructQSL."); + + m.def("StartTest", &py::StartTest, + "Run tests on a SUT created by ConstructSUT() with the provided QSL. " + "Uses default log settings."); + m.def("StartTestWithLogSettings", &py::StartTestWithLogSettings, + "Run tests on a SUT created by ConstructSUT() with the provided QSL. 
" + "Accepts custom log settings."); + m.def("QuerySamplesComplete", &py::QuerySamplesComplete, + "Called by the SUT to indicate that samples from some combination of" + "IssueQuery calls have finished.", pybind11::arg("responses"), pybind11::arg("response_cb") = ResponseCallback{}); +} + +} // namespace py +} // namespace mlperf + +#endif // PYTHON_BINDINGS_H diff --git a/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream.py b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream.py new file mode 100644 index 0000000..141b27a --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream.py @@ -0,0 +1,92 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen +import numpy + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in 3 slices that complete at different times. 
+def process_query_async(query_samples, i_slice): + time.sleep(.001 * (i_slice + 1)) + responses = [] + samples_to_complete = query_samples[i_slice:len(query_samples):3] + for s in samples_to_complete: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=(query_samples, 0)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 1)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 2)).start() + + +def flush_queries(): + pass + + +def process_latencies(latencies_ns): + print("Average latency: ") + print(numpy.mean(latencies_ns)) + print("Median latency: ") + print(numpy.percentile(latencies_ns, 50)) + print("90 percentile latency: ") + print(numpy.percentile(latencies_ns, 90)) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.MultiStream + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.multi_stream_target_latency_ns = 100000000 + settings.multi_stream_samples_per_query = 4 + settings.multi_stream_max_async_queries = 2 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT( + issue_query, flush_queries, process_latencies) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream_free.py b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream_free.py new file mode 100644 index 0000000..a603059 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream_free.py @@ -0,0 +1,92 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen +import numpy + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in 3 slices that complete at different times. 
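+# Slice i handles samples i, i+3, i+6, ... and sleeps (i+1) ms first, so the
+# three worker threads emulate responses arriving back at different times.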
+def process_query_async(query_samples, i_slice): + time.sleep(.001 * (i_slice + 1)) + responses = [] + samples_to_complete = query_samples[i_slice:len(query_samples):3] + for s in samples_to_complete: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=(query_samples, 0)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 1)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 2)).start() + + +def flush_queries(): + pass + + +def process_latencies(latencies_ns): + print("Average latency: ") + print(numpy.mean(latencies_ns)) + print("Median latency: ") + print(numpy.percentile(latencies_ns, 50)) + print("90 percentile latency: ") + print(numpy.percentile(latencies_ns, 90)) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.MultiStreamFree + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.multi_stream_target_latency_ns = 100000000 + settings.multi_stream_samples_per_query = 4 + settings.multi_stream_max_async_queries = 2 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT( + issue_query, flush_queries, process_latencies) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_offline.py b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_offline.py new file mode 100644 index 0000000..c152530 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_offline.py @@ -0,0 +1,88 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen +import numpy + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in 3 slices that complete at different times. 
+def process_query_async(query_samples, i_slice): + time.sleep(3 * (i_slice + 1)) + responses = [] + samples_to_complete = query_samples[i_slice:len(query_samples):3] + for s in samples_to_complete: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=(query_samples, 0)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 1)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 2)).start() + + +def flush_queries(): + pass + + +def process_latencies(latencies_ns): + print("Average latency: ") + print(numpy.mean(latencies_ns)) + print("Median latency: ") + print(numpy.percentile(latencies_ns, 50)) + print("90 percentile latency: ") + print(numpy.percentile(latencies_ns, 90)) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Offline + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.offline_expected_qps = 1000 + + sut = mlperf_loadgen.ConstructSUT( + issue_query, flush_queries, process_latencies) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_server.py b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_server.py new file mode 100644 index 0000000..75aa82f --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_server.py @@ -0,0 +1,85 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. 
+""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen +import numpy + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + time.sleep(.001) + responses = [] + for s in query_samples: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=[query_samples]).start() + + +def flush_queries(): + pass + + +def process_latencies(latencies_ns): + print("Average latency: ") + print(numpy.mean(latencies_ns)) + print("Median latency: ") + print(numpy.percentile(latencies_ns, 50)) + print("99 percentile latency: ") + print(numpy.percentile(latencies_ns, 99)) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Server + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.server_target_qps = 100 + settings.server_target_latency_ns = 100000000 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT( + issue_query, flush_queries, process_latencies) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_single_stream.py b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_single_stream.py new file mode 100644 index 0000000..53efa42 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_single_stream.py @@ -0,0 +1,93 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. 
+""" + +from __future__ import print_function + +import array +import threading +import time + +from absl import app +import mlperf_loadgen +import numpy + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + """Processes the list of queries.""" + time.sleep(.001) + responses = [] + response_array = array.array( + "f", [0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 254, 255]) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * response_array.itemsize + for s in query_samples: + responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=[query_samples]).start() + + +def flush_queries(): + pass + + +def process_latencies(latencies_ns): + print("Average latency: ") + print(numpy.mean(latencies_ns)) + print("Median latency: ") + print(numpy.percentile(latencies_ns, 50)) + print("90 percentile latency: ") + print(numpy.percentile(latencies_ns, 90)) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.SingleStream + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.single_stream_expected_latency_ns = 1000000 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT( + issue_query, flush_queries, process_latencies) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/docs/src/BUILD.gn b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/BUILD.gn new file mode 100644 index 0000000..865bc4d --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/BUILD.gn @@ -0,0 +1,33 @@ +generated_doxygen_out_dir = + get_path_info(".", "gen_dir") + "/.." + +loadgen_doxygen_sources = [ + "doxygen.cfg", + "doxygen_footer.html", + "doxygen_header.html", + "doxygen_layout.xml", + "doxygen_stylesheet.css", + "loadgen-integration_diagram.dia", + "mlperf_icon.png", + "mlperf_logo_horizontal_color.svg", + "README.md" +] + +source_set("loadgen_doxygen_sources") { + sources = loadgen_doxygen_sources +} + +source_set("doxygen_html_generator_script") { + sources = [ "doxygen_html_generator.py" ] +} + +action("generate_doxygen_html") { + script = "doxygen_html_generator.py" + args = [ rebase_path(generated_doxygen_out_dir, root_build_dir), + rebase_path("../..") ] + outputs = [ generated_doxygen_out_dir ] + deps = [ ":loadgen_doxygen_sources", + ":doxygen_html_generator_script", + "../..:mlperf_loadgen_sources_no_gen", + "../..:docs" ] +} diff --git a/benchmarks/rnnt/ootb/inference/loadgen/docs/src/README.md b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/README.md new file mode 100644 index 0000000..d5cf5fe --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/README.md @@ -0,0 +1,34 @@ +# Generating the HTML docs {#ReadmeHtmlDocs} + +This document is generated from inline docstrings in the source and +various markdown files checked into the git repository. If you've +checked out the code, you can generate this documentation. 
+ +*Prerequisite:* You must have [doxygen](http://www.doxygen.nl) installed +on your system: + +## With gn / ninja + +If you are using the gn build flow, you may run: + +    ninja -C out/Release generate_doxygen_html + +* This will output the documentation to out/Release/gen/loadgen/docs/gen and +avoid polluting the source directory. + +## Manually + +Alternatively, you can manually run: + +    python docs/src/doxygen_html_generator.py <doxygen_out_dir> <loadgen_root> + +* If <loadgen_root> is omitted, it will default to ".". +* If <doxygen_out_dir> is also omitted, it will default to "./docs/gen". + +## Hosting + +A version of this doc is currently hosted online at +https://mlperf.github.io/inference/loadgen/index.html + +To update the hosted version, submit a PR to the +[mlperf.github.io](https://github.com/mlperf/mlperf.github.io) repository. diff --git a/benchmarks/rnnt/ootb/inference/loadgen/docs/src/doxygen.cfg b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/doxygen.cfg new file mode 100644 index 0000000..fc05853 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/doxygen.cfg @@ -0,0 +1,2495 @@ +# Doxyfile 1.8.13 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "LoadGen Guide" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory.
+ +PROJECT_LOGO = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_logo_horizontal_color.svg + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = $(MLPERF_DOXYGEN_OUT_PATH) + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. 
+ +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = YES + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. 
+ +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. 
+# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 1 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. 
+ +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. 
+ +EXTRACT_PRIVATE = YES + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = YES + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. 
+# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. 
By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. 
You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_layout.xml + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). 
+ +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = $(MLPERF_LOADGEN_SRC_PATH) + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = depot_tools + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. 
The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = $(MLPERF_LOADGEN_SRC_PATH)/docs/src + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# +# +# where is the value of the INPUT_FILTER tag, and is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. 
+ +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. 
+ +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = YES + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = -I ../third_party/pybind/include --std=c++14 + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot o= +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. 
+# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_header.html + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_footer.html + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_stylesheet.css + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_EXTRA_FILES = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_icon.png \ + $(MLPERF_LOADGEN_SRC_PATH)/loadgen_integration_diagram.svg + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 127 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = YES + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 50 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = YES + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. 
For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/