diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..5e97f52 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length=120 +ignore = E203,E305,E402,E721,E741,F401,F403,F405,F821,F841,F999,W503,W504 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d826437 --- /dev/null +++ b/.gitignore @@ -0,0 +1,145 @@ +# Data files +.data + +# Results folders +run_kaggle_pt/ +results/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..8b32f06 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "param"] + path = param + url = https://github.com/facebookresearch/param.git +[submodule "benchmarks/rnnt/ootb/inference/third_party/pybind"] + path = benchmarks/rnnt/ootb/inference/third_party/pybind + url = https://github.com/pybind/pybind11.git diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..08b500a --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,80 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. 
+ +This Code of Conduct also applies outside the project spaces when there is a +reasonable belief that an individual's behavior may have a negative impact on +the project or its community. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..1665402 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing to proxyworkloads +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `main`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## License +By contributing to proxyworkloads, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bb84d29 --- /dev/null +++ b/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2021 Meta Platforms, Inc. and its affiliates. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..2bb9ad2 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,176 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f2e38c0 --- /dev/null +++ b/README.md @@ -0,0 +1,131 @@ +# Proxy Workloads + +These benchmarks represent important workloads. The faster these benchmarks are, the happier owners of important workloads are. The maintainers, updates, and rules in this benchmark suite all exist to preserve the connection between the people running these benchmarks and the people running the original workloads. + +The key things to know: +- These benchmarks are directly connected to real workloads run every day +- The main metric is throughput, subject to some constraints such as latency or max batch size +- Data is often synthetic, though we have safeguards to ensure correctness +- There are special requirements when improving these benchmarks - it's not "anything goes" +- This includes benchmarks (runnable on 1 device, multiple devices, clusters) and microbenchmarks + + +To get started running the benchmark suite right away on a V100: + + cd proxyworkloads/benchmarks + ./run_all.sh + + +## The Suite + +This suite captures benchmarks across multiple devices, across multiple precisions, and includes microbenchmarks.
We organize the suite so each benchmark result is identified as: + + Benchmark = Models + Implementation + Mode + Configuration + +### Models +This suite contains the following benchmarks: +- Recommendation: DLRM +- Text: XLM-R (WIP) +- Vision: CVT (Planned) +- Text: OSCAR (Planned) +- Speech: RNN-T (WIP) +- Video: Resnext-3D (Planned) +- Image: Regnet-Y (Planned) + +### Implementation + +Each benchmark comes in three different implementations: +- Out Of The Box (OOTB): indicates the performance that is provided by the libraries and frameworks. Code is written the way a regular AI engineer or researcher would write it, not the way a systems/hardware specialist would. +- Optimized: represents the best possible performance which can be reached; the code is tuned, re-written (and perhaps even mangled) by hardware and software experts. +- Microbenchmarks: benchmarks which look at a specific component of a device, computer, or cluster. These are highly unique and specialized in their purpose. + +### Modes + +For OOTB and optimized implementations, the modes are Inference and Training. For Microbenchmarks, the mode is the specific kind of microbenchmark being run. + +### Configurations + +Each implementation comes in multiple configurations. Each configuration looks at the benchmark in a different way, such as: +- The model and data scaled to different numbers of devices: e.g. 1, 8, multiple nodes +- Different precisions and numeric formats +- Different variants of the models, representing different layer counts or sizes at which the model might be run. + +## Results + +Running one or more benchmarks on a specific machine or cluster produces a results table. Below are example results which you may get. + +|Model |Implementation|Mode |Config |Batch Size|Score |Units| |-------------------------|--------------|------------|-------------------|----------|------|-----| |Recommend: DLRM |OOTB |Training |A.1dev-embed32-fp32|1024 |570.16|ex/s | |Recommend: DLRM |OOTB |Inference |A.1dev-embed4-fp32 |1024 |61.85*|ex/s | |Recommend: DLRM |Micro |MLP/Linear |linear_A.1dev |256 |7.08 |TF/s | |Recommend: DLRM |Micro |EmbeddingBag|emb_A.1dev |65536 |537.80|GB/s | +\* = missed latency target + +Notice the following in this table: +- Each row is one Benchmark run with a batch size (`Model + Implementation + Mode + Config` at a given batch size). More on batch size in Suite Design. +- All rows in the same table are run on the same machine. Benchmarks from different hardware must appear in different result tables. +- Some results have a `*` denoting that they missed the latency target. More on latency targets in Suite Design. +- You may report multiple batch sizes for the same benchmark; they appear as different lines in the table. + + +### Results by System Scale +We look at all the results to understand the broader picture of performance. + +**For systems that can't run the full model:** Microbenchmarks give us a picture of potential performance and early indicators of where to explore more. + +**For single-device systems:** For training, single-device configurations and microbenchmarks can indicate trends in overall cluster performance; microbenchmarks run on the cluster, paired with single-device results, can indicate whether single-device performance is in fact the bottleneck. For inference, since a single inference is often easily parallelizable across multiple devices, the single-device benchmarks are a very good indicator of real performance.
This has the added advantage of being quick and easy for debugging and experiments. + +**For multiple devices, single node:** For training, multi-device configurations give good insight into how single nodes perform within a cluster - this can be combined with microbenchmarks on the cluster to predict overall performance. For inference, this is a great reflection of actual workloads. This has the added advantage of being quick and easy for debugging and experiments. + +**For clusters:** Running these benchmarks on a cluster gives the best indication of performance for training but does not add additional information for inference. The downside is, obviously, that these runs are more costly to set up and run. + + +### How Results are Consumed +There are two broad comparisons that can be done: system-to-system and OOTB vs. Optimized. + +- System to system: Compare two tables generated by two different systems to understand their differences. +- OOTB vs. Optimized: Look at one table, one system, and understand the gap between the software (compilers, frameworks, and libraries) and what might be possible if the software were improved. + +Generally, consuming results is specific to the situation. Different goals lead to different priorities and weights when evaluating results, so there isn't a one-size-fits-all approach here. It's up to the people and the situation. + + +## Suite Design +We are very specific about how these benchmarks must be run and optimized in order to maintain our goal: **improvements to these benchmarks connect directly to improvements in important internal workloads**. Where our methodology may seem arbitrary or cumbersome, it is in service of maintaining the connection to the source. + +### Ownership, Versions & Updates +Each Benchmark (`Model + Implementation + Mode + Config`) is connected with an actual owner of an actual workload who endorsed the benchmark. The owner is the arbiter of changes, updates, and methodology for the benchmark. It is exceptionally frustrating to see benchmarks change while you are working on them. It sucks, so we version our benchmarks to help with bookkeeping. Ultimately, our goal here is to reflect the current state of what people care about - unfortunately this means (sometimes too frequently) bumping versions to ensure we are offering the best proxy to the world. + +### Convergence and Accuracy +The gold standard for understanding how the system works is measuring convergence and accuracy of the model in the end-to-end context. Unfortunately, as shown by MLPerf, this is exceptionally costly, burdensome, and slow. We do not place an emphasis on convergence and accuracy for the following reasons: +- We don't allow significant changes to model code (see "Improving the Benchmark Score"), so we don't expect people to break convergence +- We limit the data types and precisions to ones we understand and know to be viable +- We (will) offer the ability to verify correctness (possibly through real data or through statistical analysis on synthetic data) +- We lean on MLPerf, which has a similar suite of models and requires submissions to test correctness. + +Overall, we aim to allow benchmarking at a granularity which is usable by people in their projects, representative of the actual workloads, and not overly cumbersome or expensive. It's a compromise. + +### Data +As discussed in Convergence and Accuracy, we are not an accuracy or convergence benchmark.
This frees us up to use synthetic data, which significantly improves usability and time-to-results for this suite. + +We may choose to use real data, or data derived from real data, where we cannot generate proper synthetic data. + +### Batch Sizes +Generally speaking, the bigger the batch size, the better the throughput, but the longer the time to converge and the higher the latency. When running these benchmarks, people will want to see: +- The benchmark run at specific known batch sizes (where the convergence is understood) to allow for predicting and modeling +- The benchmark run at the batch size which gives the best throughput, subject to either (a) a maximum batch size for which the model will converge, or (b) a latency requirement for requests. + +### Latency Limits +Inference benchmarks come with latency limits, and the goal is to provide the best QPS while hitting the latency limit. Some inference benchmarks may reflect user-facing operations where latency is key. Some inference benchmarks may reflect background jobs where throughput is key - so the latency limit is very high in these cases. + +## Improving the Benchmark Score +The bigger the score, the better - but there are limits on how to get there. The limits depend on the implementation (Out-Of-The-Box (OOTB), Optimized, or Microbenchmark). + +- Out-Of-The-Box (OOTB): Improvements must come in through libraries, frameworks, and new hardware. No changing the model code (special exceptions for non-optimizing changes which enable porting to new hardware). +- Optimized: No holds barred - make the system shine. Just keep in mind that for everything you do, you're asking the actual people who run the workloads to do it too if they're going to realize that performance. You'll need to describe what changes you made, so keep track. +- Microbenchmarks: Implement the same operation as defined, and make it as fast as possible. + +## License + +This is released under the Apache 2.0 license. Please see the [`LICENSE`](LICENSE) file for more information. + diff --git a/benchmarks/dlrm/ootb/CODE_OF_CONDUCT.md b/benchmarks/dlrm/ootb/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..0f7ad8b --- /dev/null +++ b/benchmarks/dlrm/ootb/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Code of Conduct + +Facebook has adopted a Code of Conduct that we expect project participants to adhere to. +Please read the [full text](https://code.fb.com/codeofconduct/) +so that you can understand what actions will and will not be tolerated. diff --git a/benchmarks/dlrm/ootb/CONTRIBUTING.md b/benchmarks/dlrm/ootb/CONTRIBUTING.md new file mode 100644 index 0000000..cc013a1 --- /dev/null +++ b/benchmarks/dlrm/ootb/CONTRIBUTING.md @@ -0,0 +1,36 @@ +# Contributing to DLRM +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs.
Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## Coding Style +* 4 spaces for indentation rather than tabs +* 80 character line length +* in general, please maintain a consistent style with the rest of the code + +## License +By contributing to DLRM, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. diff --git a/benchmarks/dlrm/ootb/Dockerfile b/benchmarks/dlrm/ootb/Dockerfile new file mode 100644 index 0000000..0e4b750 --- /dev/null +++ b/benchmarks/dlrm/ootb/Dockerfile @@ -0,0 +1,15 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +ARG FROM_IMAGE_NAME=pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime +FROM ${FROM_IMAGE_NAME} + +ADD requirements.txt . +RUN pip install -r requirements.txt + +RUN pip install torch==1.3.1 + +WORKDIR /code +ADD . . diff --git a/benchmarks/dlrm/ootb/README.md b/benchmarks/dlrm/ootb/README.md new file mode 100644 index 0000000..7096b83 --- /dev/null +++ b/benchmarks/dlrm/ootb/README.md @@ -0,0 +1,389 @@ +Deep Learning Recommendation Model for Personalization and Recommendation Systems: +================================================================================= +*Copyright (c) Facebook, Inc. and its affiliates.* + +Description: +------------ +An implementation of a deep learning recommendation model (DLRM) +The model input consists of dense and sparse features. The former is a vector +of floating point values. The latter is a list of sparse indices into +embedding tables, which consist of vectors of floating point values. +The selected vectors are passed to mlp networks denoted by triangles, +in some cases the vectors are interacted through operators (Ops). +``` +output: + probability of a click +model: | + /\ + /__\ + | + _____________________> Op <___________________ + / | \ + /\ /\ /\ + /__\ /__\ ... /__\ + | | | + | Op Op + | ____/__\_____ ____/__\____ + | |_Emb_|____|__| ... |_Emb_|__|___| +input: +[ dense features ] [sparse indices] , ..., [sparse indices] +``` + More precise definition of model layers: + 1) fully connected layers of an mlp + + z = f(y) + + y = Wx + b + + 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) + + z = Op(e1,...,ek) + + obtain vectors e1=E[:,p1], ..., ek=E[:,pk] + + 3) Operator Op can be one of the following + + Sum(e1,...,ek) = e1 + ... + ek + + Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] + + Cat(e1,...,ek) = [e1', ..., ek']' + + where ' denotes transpose operation + +Cite [Work](https://arxiv.org/abs/1906.00091): +``` +@article{DLRM19, + author = {Maxim Naumov and Dheevatsa Mudigere and Hao{-}Jun Michael Shi and Jianyu Huang and Narayanan Sundaraman and Jongsoo Park and Xiaodong Wang and Udit Gupta and Carole{-}Jean Wu and Alisson G. 
Azzolini and Dmytro Dzhulgakov and Andrey Mallevich and Ilia Cherniavskii and Yinghai Lu and Raghuraman Krishnamoorthi and Ansha Yu and Volodymyr Kondratenko and Stephanie Pereira and Xianjie Chen and Wenlin Chen and Vijay Rao and Bill Jia and Liang Xiong and Misha Smelyanskiy}, + title = {Deep Learning Recommendation Model for Personalization and Recommendation Systems}, + journal = {CoRR}, + volume = {abs/1906.00091}, + year = {2019}, + url = {https://arxiv.org/abs/1906.00091}, +} +``` + +Related Work: + +On the [system architecture implications](https://arxiv.org/abs/1906.03109), with DLRM as one of the benchmarks, +``` +@article{ArchImpl19, + author = {Udit Gupta and Xiaodong Wang and Maxim Naumov and Carole{-}Jean Wu and Brandon Reagen and David Brooks and Bradford Cottel and Kim M. Hazelwood and Bill Jia and Hsien{-}Hsin S. Lee and Andrey Malevich and Dheevatsa Mudigere and Mikhail Smelyanskiy and Liang Xiong and Xuan Zhang}, + title = {The Architectural Implications of Facebook's DNN-based Personalized Recommendation}, + journal = {CoRR}, + volume = {abs/1906.03109}, + year = {2019}, + url = {https://arxiv.org/abs/1906.03109}, +} +``` + +On the [embedding compression techniques (for number of vectors)](https://arxiv.org/abs/1909.02107), with DLRM as one of the benchmarks, +``` +@article{QuoRemTrick19, + author = {Hao{-}Jun Michael Shi and Dheevatsa Mudigere and Maxim Naumov and Jiyan Yang}, + title = {Compositional Embeddings Using Complementary Partitions for Memory-Efficient Recommendation Systems}, + journal = {CoRR}, + volume = {abs/1909.02107}, + year = {2019}, + url = {https://arxiv.org/abs/1909.02107}, +} +``` + +On the [embedding compression techniques (for dimension of vectors)](https://arxiv.org/abs/1909.11810), with DLRM as one of the benchmarks, +``` +@article{MixDimTrick19, + author = {Antonio Ginart and Maxim Naumov and Dheevatsa Mudigere and Jiyan Yang and James Zou}, + title = {Mixed Dimension Embeddings with Application to Memory-Efficient Recommendation Systems}, + journal = {CoRR}, + volume = {abs/1909.11810}, + year = {2019}, + url = {https://arxiv.org/abs/1909.11810}, +} +``` + +Implementation +-------------- +**DLRM PyTorch**. Implementation of DLRM in PyTorch framework: + + dlrm_s_pytorch.py + +**DLRM Caffe2**. Implementation of DLRM in Caffe2 framework: + + dlrm_s_caffe2.py + +**DLRM Data**. Implementation of DLRM data generation and loading: + + dlrm_data_pytorch.py, dlrm_data_caffe2.py, data_utils.py + +**DLRM Tests**. Implementation of DLRM tests in ./test + + dlrm_s_test.sh + +**DLRM Benchmarks**. Implementation of DLRM benchmarks in ./bench + + dlrm_s_criteo_kaggle.sh, dlrm_s_criteo_terabyte.sh, dlrm_s_benchmark.sh + +Related Work: + +On the [Glow framework](https://github.com/pytorch/glow) implementation +``` +https://github.com/pytorch/glow/blob/master/tests/unittests/RecommendationSystemTest.cpp +``` +On the [FlexFlow framework](https://github.com/flexflow/FlexFlow) distributed implementation with Legion backend +``` +https://github.com/flexflow/FlexFlow/blob/master/examples/cpp/DLRM/dlrm.cc +``` + +How to run dlrm code? 
+-------------------- +1) A sample run of the code, with a tiny model is shown below +``` +$ python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 +time/loss/accuracy (if enabled): +Finished training it 1/3 of epoch 0, -1.00 ms/it, loss 0.451893, accuracy 0.000% +Finished training it 2/3 of epoch 0, -1.00 ms/it, loss 0.402002, accuracy 0.000% +Finished training it 3/3 of epoch 0, -1.00 ms/it, loss 0.275460, accuracy 0.000% +``` +2) A sample run of the code, with a tiny model in debug mode +``` +$ python dlrm_s_pytorch.py --mini-batch-size=2 --data-size=6 --debug-mode +model arch: +mlp top arch 3 layers, with input to output dimensions: +[8 4 2 1] +# of interactions +8 +mlp bot arch 2 layers, with input to output dimensions: +[4 3 2] +# of features (sparse and dense) +4 +dense feature size +4 +sparse feature size +2 +# of embeddings (= # of sparse features) 3, with dimensions 2x: +[4 3 2] +data (inputs and targets): +mini-batch: 0 +[[0.69647 0.28614 0.22685 0.55131] + [0.71947 0.42311 0.98076 0.68483]] +[[[1], [0, 1]], [[0], [1]], [[1], [0]]] +[[0.55679] + [0.15896]] +mini-batch: 1 +[[0.36179 0.22826 0.29371 0.63098] + [0.0921 0.4337 0.43086 0.49369]] +[[[1], [0, 2, 3]], [[1], [1, 2]], [[1], [1]]] +[[0.15307] + [0.69553]] +mini-batch: 2 +[[0.60306 0.54507 0.34276 0.30412] + [0.41702 0.6813 0.87546 0.51042]] +[[[2], [0, 1, 2]], [[1], [2]], [[1], [1]]] +[[0.31877] + [0.69197]] +initial parameters (weights and bias): +[[ 0.05438 -0.11105] + [ 0.42513 0.34167] + [-0.1426 -0.45641] + [-0.19523 -0.10181]] +[[ 0.23667 0.57199] + [-0.16638 0.30316] + [ 0.10759 0.22136]] +[[-0.49338 -0.14301] + [-0.36649 -0.22139]] +[[0.51313 0.66662 0.10591 0.13089] + [0.32198 0.66156 0.84651 0.55326] + [0.85445 0.38484 0.31679 0.35426]] +[0.17108 0.82911 0.33867] +[[0.55237 0.57855 0.52153] + [0.00269 0.98835 0.90534]] +[0.20764 0.29249] +[[0.52001 0.90191 0.98363 0.25754 0.56436 0.80697 0.39437 0.73107] + [0.16107 0.6007 0.86586 0.98352 0.07937 0.42835 0.20454 0.45064] + [0.54776 0.09333 0.29686 0.92758 0.569 0.45741 0.75353 0.74186] + [0.04858 0.7087 0.83924 0.16594 0.781 0.28654 0.30647 0.66526]] +[0.11139 0.66487 0.88786 0.69631] +[[0.44033 0.43821 0.7651 0.56564] + [0.0849 0.58267 0.81484 0.33707]] +[0.92758 0.75072] +[[0.57406 0.75164]] +[0.07915] +DLRM_Net( + (emb_l): ModuleList( + (0): EmbeddingBag(4, 2, mode=sum) + (1): EmbeddingBag(3, 2, mode=sum) + (2): EmbeddingBag(2, 2, mode=sum) + ) + (bot_l): Sequential( + (0): Linear(in_features=4, out_features=3, bias=True) + (1): ReLU() + (2): Linear(in_features=3, out_features=2, bias=True) + (3): ReLU() + ) + (top_l): Sequential( + (0): Linear(in_features=8, out_features=4, bias=True) + (1): ReLU() + (2): Linear(in_features=4, out_features=2, bias=True) + (3): ReLU() + (4): Linear(in_features=2, out_features=1, bias=True) + (5): Sigmoid() + ) +) +time/loss/accuracy (if enabled): +Finished training it 1/3 of epoch 0, -1.00 ms/it, loss 0.451893, accuracy 0.000% +Finished training it 2/3 of epoch 0, -1.00 ms/it, loss 0.402002, accuracy 0.000% +Finished training it 3/3 of epoch 0, -1.00 ms/it, loss 0.275460, accuracy 0.000% +updated parameters (weights and bias): +[[ 0.0543 -0.1112 ] + [ 0.42513 0.34167] + [-0.14283 -0.45679] + [-0.19532 -0.10197]] +[[ 0.23667 0.57199] + [-0.1666 0.30285] + [ 0.10751 0.22124]] +[[-0.49338 -0.14301] + [-0.36664 -0.22164]] +[[0.51313 0.66663 0.10591 0.1309 ] + [0.32196 0.66154 0.84649 0.55324] + [0.85444 0.38482 0.31677 0.35425]] +[0.17109 0.82907 0.33863] +[[0.55238 0.57857 0.52154] + [0.00265 0.98825 0.90528]] +[0.20764 
0.29244] +[[0.51996 0.90184 0.98368 0.25752 0.56436 0.807 0.39437 0.73107] + [0.16096 0.60055 0.86596 0.98348 0.07938 0.42842 0.20453 0.45064] + [0.5476 0.0931 0.29701 0.92752 0.56902 0.45752 0.75351 0.74187] + [0.04849 0.70857 0.83933 0.1659 0.78101 0.2866 0.30646 0.66526]] +[0.11137 0.66482 0.88778 0.69627] +[[0.44029 0.43816 0.76502 0.56561] + [0.08485 0.5826 0.81474 0.33702]] +[0.92754 0.75067] +[[0.57379 0.7514 ]] +[0.07908] +``` + +Testing +------- +Testing scripts to confirm functional correctness of the code +``` +./test/dlrm_s_test.sh +Running commands ... +python dlrm_s_pytorch.py +python dlrm_s_caffe2.py +Checking results ... +diff test1 (no numeric values in the output = SUCCESS) +diff test2 (no numeric values in the output = SUCCESS) +diff test3 (no numeric values in the output = SUCCESS) +diff test4 (no numeric values in the output = SUCCESS) +``` + +*NOTE: Testing scripts accept extra arguments which will be passed along to the model, such as --use-gpu* + +Benchmarking +------------ +1) Performance benchmarking + ``` + ./bench/dlrm_s_benchmark.sh + ``` + +2) The code supports interface with the [Criteo Kaggle Display Advertising Challenge Dataset](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/). + - Please do the following to prepare the dataset for use with DLRM code: + - First, specify the raw data file (train.txt) as downloaded with --raw-data-file= + - This is then pre-processed (categorize, concat across days...) to allow using with dlrm code + - The processed data is stored as *.npz file in /input/*.npz + - The processed file (*.npz) can be used for subsequent runs with --processed-data-file= + - The model can be trained using the following script + ``` + ./bench/dlrm_s_criteo_kaggle.sh [--test-freq=1024] + ``` + + + +3) The code supports interface with the [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/). + - Please do the following to prepare the dataset for use with DLRM code: + - First, download the raw data files day_0.gz, ...,day_23.gz and unzip them + - Specify the location of the unzipped text files day_0, ...,day_23, using --raw-data-file= (the day number will be appended automatically) + - These are then pre-processed (categorize, concat across days...) to allow using with dlrm code + - The processed data is stored as *.npz file in /input/*.npz + - The processed file (*.npz) can be used for subsequent runs with --processed-data-file= + - The model can be trained using the following script + ``` + ./bench/dlrm_s_criteo_terabyte.sh ["--test-freq=10240 --memory-map --data-sub-sample-rate=0.875"] + ``` + - Corresponding pre-trained model is available under [CC-BY-NC license](https://creativecommons.org/licenses/by-nc/2.0/) and can be downloaded here + [dlrm_emb64_subsample0.875_maxindrange10M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt) + + + +*NOTE: Benchmarking scripts accept extra arguments which will be passed along to the model, such as --num-batches=100 to limit the number of data samples* + +4) The code supports interface with [MLPerf benchmark](https://mlperf.org). 
+ - Please refer to the following training parameters + ``` + --mlperf-logging that keeps track of multiple metrics, including area under the curve (AUC) + + --mlperf-acc-threshold that allows early stopping based on accuracy metric + + --mlperf-auc-threshold that allows early stopping based on AUC metric + + --mlperf-bin-loader that enables preprocessing of data into a single binary file + + --mlperf-bin-shuffle that controls whether a random shuffle of mini-batches is performed + ``` + - The MLPerf training model is completely specified and can be trained using the following script + ``` + ./bench/run_and_time.sh [--use-gpu] + ``` + - Corresponding pre-trained model is available under [CC-BY-NC license](https://creativecommons.org/licenses/by-nc/2.0/) and can be downloaded here + [dlrm_emb128_subsample0.0_maxindrange40M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.pt) + +5) The code now supports synchronous distributed training, we support gloo/nccl/mpi backend, we provide launching mode for [pytorch distributed launcher](https://pytorch.org/docs/stable/distributed.html#launch-utility) and Mpirun. For MPI, users need to write their own MPI launching scripts for configuring the running hosts. For example, using pytorch distributed launcher, we can have the following command as launching scripts: +``` +# for single node 8 gpus and nccl as backend on randomly generated dataset: +python -m torch.distributed.launch --nproc_per_node=8 dlrm_s_pytorch.py --arch-embedding-size="80000-80000-80000-80000-80000-80000-80000-80000" --arch-sparse-feature-size=64 --arch-mlp-bot="128-128-128-128" --arch-mlp-top="512-512-512-256-1" --max-ind-range=40000000 +--data-generation=random --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2 --print-time --test-freq=2 --test-mini-batch-size=2048 --memory-map --use-gpu --num-batches=100 --dist-backend=nccl + +# for multiple nodes, user can add the related argument according to the launcher manual like: +--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" --master_port=1234 +``` + + +Model checkpoint saving/loading +------------------------------- +During training, the model can be saved using --save-model= + +The model is saved if there is an improvement in test accuracy (which is checked at --test-freq intervals). + +A previously saved model can be loaded using --load-model= + +Once loaded the model can be used to continue training, with the saved model being a checkpoint. +Alternatively, the saved model can be used to evaluate only on the test data-set by specifying --inference-only option. + + +Version +------- +0.1 : Initial release of the DLRM code + +1.0 : DLRM with distributed training, cpu support for row-wise adagrad optimizer + +Requirements +------------ +pytorch-nightly (*11/10/20*) + +scikit-learn + +numpy + +onnx (*optional*) + +pydot (*optional*) + +torchviz (*optional*) + +mpi (*optional for distributed backend*) + + +License +------- +This source code is licensed under the MIT license found in the +LICENSE file in the root directory of this source tree. diff --git a/benchmarks/dlrm/ootb/bench/dlrm_s_benchmark.sh b/benchmarks/dlrm/ootb/bench/dlrm_s_benchmark.sh new file mode 100755 index 0000000..c6a75e2 --- /dev/null +++ b/benchmarks/dlrm/ootb/bench/dlrm_s_benchmark.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +#check if extra argument is passed to the test +if [[ $# == 1 ]]; then + dlrm_extra_option=$1 +else + dlrm_extra_option="" +fi +#echo $dlrm_extra_option + +cpu=1 +gpu=1 +pt=1 +c2=1 + +ncores=28 #12 #6 +nsockets="0" + +ngpus="1 2 4 8" + +numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT +dlrm_pt_bin="python dlrm_s_pytorch.py" +dlrm_c2_bin="python dlrm_s_caffe2.py" + +data=random #synthetic +print_freq=100 +rand_seed=727 + +c2_net="async_scheduling" + +#Model param +mb_size=2048 #1024 #512 #256 +nbatches=1000 #500 #100 +bot_mlp="512-512-64" +top_mlp="1024-1024-1024-1" +emb_size=64 +nindices=100 +emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000" +interaction="dot" +tnworkers=0 +tmb_size=16384 + +#_args="--mini-batch-size="${mb_size}\ +_args=" --num-batches="${nbatches}\ +" --data-generation="${data}\ +" --arch-mlp-bot="${bot_mlp}\ +" --arch-mlp-top="${top_mlp}\ +" --arch-sparse-feature-size="${emb_size}\ +" --arch-embedding-size="${emb}\ +" --num-indices-per-lookup="${nindices}\ +" --arch-interaction-op="${interaction}\ +" --numpy-rand-seed="${rand_seed}\ +" --print-freq="${print_freq}\ +" --print-time"\ +" --enable-profiling " + +c2_args=" --caffe2-net-type="${c2_net} + + +# CPU Benchmarking +if [ $cpu = 1 ]; then + echo "--------------------------------------------" + echo "CPU Benchmarking - running on $ncores cores" + echo "--------------------------------------------" + if [ $pt = 1 ]; then + outf="model1_CPU_PT_$ncores.log" + outp="dlrm_s_pytorch.prof" + echo "-------------------------------" + echo "Running PT (log file: $outf)" + echo "-------------------------------" + cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf" + echo $cmd + eval $cmd + min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') + echo "Min time per iteration = $min" + # move profiling file(s) + mv $outp ${outf//".log"/".prof"} + mv ${outp//".prof"/".json"} ${outf//".log"/".json"} + + fi + if [ $c2 = 1 ]; then + outf="model1_CPU_C2_$ncores.log" + outp="dlrm_s_caffe2.prof" + echo "-------------------------------" + echo "Running C2 (log file: $outf)" + echo "-------------------------------" + cmd="$numa_cmd $dlrm_c2_bin --mini-batch-size=$mb_size $_args $c2_args $dlrm_extra_option 1> $outf 2> $outp" + echo $cmd + eval $cmd + min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') + echo "Min time per iteration = $min" + # move profiling file (collected from stderr above) + mv $outp ${outf//".log"/".prof"} + fi +fi + +# GPU Benchmarking +if [ $gpu = 1 ]; then + echo "--------------------------------------------" + echo "GPU Benchmarking - running on $ngpus GPUs" + echo "--------------------------------------------" + for _ng in $ngpus + do + # weak scaling + # _mb_size=$((mb_size*_ng)) + # strong scaling + _mb_size=$((mb_size*1)) + _gpus=$(seq -s, 0 $((_ng-1))) + cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus" + echo "-------------------" + echo "Using GPUS: "$_gpus + echo "-------------------" + if [ $pt = 1 ]; then + outf="model1_GPU_PT_$_ng.log" + outp="dlrm_s_pytorch.prof" + echo "-------------------------------" + echo "Running PT (log file: $outf)" + echo "-------------------------------" + cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf" + echo $cmd + eval 
$cmd + min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') + echo "Min time per iteration = $min" + # move profiling file(s) + mv $outp ${outf//".log"/".prof"} + mv ${outp//".prof"/".json"} ${outf//".log"/".json"} + fi + if [ $c2 = 1 ]; then + outf="model1_GPU_C2_$_ng.log" + outp="dlrm_s_caffe2.prof" + echo "-------------------------------" + echo "Running C2 (log file: $outf)" + echo "-------------------------------" + cmd="$cuda_arg $dlrm_c2_bin --mini-batch-size=$_mb_size $_args $c2_args --use-gpu $dlrm_extra_option 1> $outf 2> $outp" + echo $cmd + eval $cmd + min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') + echo "Min time per iteration = $min" + # move profiling file (collected from stderr above) + mv $outp ${outf//".log"/".prof"} + fi + done +fi diff --git a/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_kaggle.sh b/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_kaggle.sh new file mode 100755 index 0000000..867d8c0 --- /dev/null +++ b/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_kaggle.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +#WARNING: must have compiled PyTorch and caffe2 + +#check if extra argument is passed to the test +if [[ $# == 1 ]]; then + dlrm_extra_option=$1 +else + dlrm_extra_option="" +fi +#echo $dlrm_extra_option + +dlrm_pt_bin="python dlrm_s_pytorch.py" +dlrm_c2_bin="python dlrm_s_caffe2.py" + +echo "run pytorch ..." +# WARNING: the following parameters will be set based on the data set +# --arch-embedding-size=... (sparse feature sizes) +# --arch-mlp-bot=... (the input to the first layer of bottom mlp) +$dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_kaggle_pt.log + +echo "run caffe2 ..." +# WARNING: the following parameters will be set based on the data set +# --arch-embedding-size=... (sparse feature sizes) +# --arch-mlp-bot=... (the input to the first layer of bottom mlp) +$dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log + +echo "done" diff --git a/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_terabyte.sh b/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_terabyte.sh new file mode 100755 index 0000000..5a4ee94 --- /dev/null +++ b/benchmarks/dlrm/ootb/bench/dlrm_s_criteo_terabyte.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
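+#
+# Usage sketch: the single optional argument below is forwarded verbatim to both
+# binaries, so any dlrm_s_pytorch.py / dlrm_s_caffe2.py flag can be appended,
+# e.g. (hypothetical invocation):
+#   ./dlrm_s_criteo_terabyte.sh "--use-gpu"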
+# +#WARNING: must have compiled PyTorch and caffe2 + +#check if extra argument is passed to the test +if [[ $# == 1 ]]; then + dlrm_extra_option=$1 +else + dlrm_extra_option="" +fi +#echo $dlrm_extra_option + +dlrm_pt_bin="python dlrm_s_pytorch.py" +dlrm_c2_bin="python dlrm_s_caffe2.py" + +echo "run pytorch ..." +# WARNING: the following parameters will be set based on the data set +# --arch-embedding-size=... (sparse feature sizes) +# --arch-mlp-bot=... (the input to the first layer of bottom mlp) +$dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log + +echo "run caffe2 ..." +# WARNING: the following parameters will be set based on the data set +# --arch-embedding-size=... (sparse feature sizes) +# --arch-mlp-bot=... (the input to the first layer of bottom mlp) +$dlrm_c2_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_terabyte_c2.log + +echo "done" diff --git a/benchmarks/dlrm/ootb/bench/run_and_time.sh b/benchmarks/dlrm/ootb/bench/run_and_time.sh new file mode 100755 index 0000000..e241d80 --- /dev/null +++ b/benchmarks/dlrm/ootb/bench/run_and_time.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +#WARNING: must have compiled PyTorch and caffe2 + +#check if extra argument is passed to the test +if [[ $# == 1 ]]; then + dlrm_extra_option=$1 +else + dlrm_extra_option="" +fi +#echo $dlrm_extra_option + +python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log + +echo "done" diff --git a/benchmarks/dlrm/ootb/cython/cython_compile.py b/benchmarks/dlrm/ootb/cython/cython_compile.py new file mode 100644 index 0000000..ffacf08 --- /dev/null +++ b/benchmarks/dlrm/ootb/cython/cython_compile.py @@ -0,0 +1,26 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
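+#
+# Build sketch (assuming data_utils.py has already been copied to
+# data_utils_cython.pyx, as the step-by-step notes in cython_criteo.py describe):
+#   python cython_compile.py build_ext --inplace
+# which should leave a data_utils_cython*.so next to this script.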
+# +# Description: compile .so from python code + +from __future__ import absolute_import, division, print_function, unicode_literals + +from setuptools import setup +from Cython.Build import cythonize +from distutils.extension import Extension + +ext_modules = [ + Extension( + "data_utils_cython", + ["data_utils_cython.pyx"], + extra_compile_args=['-O3'], + extra_link_args=['-O3'], + ) +] + +setup( + name='data_utils_cython', + ext_modules=cythonize(ext_modules) +) diff --git a/benchmarks/dlrm/ootb/cython/cython_criteo.py b/benchmarks/dlrm/ootb/cython/cython_criteo.py new file mode 100644 index 0000000..46a0b7d --- /dev/null +++ b/benchmarks/dlrm/ootb/cython/cython_criteo.py @@ -0,0 +1,55 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: run dataset pre-processing in standalone mode +# WARNING: These steps are required to work with Cython +# 1. Instal Cython +# > sudo yum install Cython +# 2. Please copy data_utils.py into data_utils_cython.pyx +# 3. Compile the data_utils_cython.pyx to generate .so +# (it's important to keep extension .pyx rather than .py +# to ensure the C/C++ .so no .py is loaded at import time) +# > python cython_compile.py build_ext --inplace +# This should create data_utils_cython.so, which can be loaded below with "import" +# 4. Run standalone datatset preprocessing to generate .npz files +# a. Kaggle +# > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt +# --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz +# b. Terabyte +# > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte +# --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz + +from __future__ import absolute_import, division, print_function, unicode_literals + +import data_utils_cython as duc + +if __name__ == "__main__": + ### import packages ### + import argparse + + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Preprocess Criteo dataset" + ) + # model related parameters + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--memory-map", action="store_true", default=False) + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + args = parser.parse_args() + + duc.loadDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map + ) diff --git a/benchmarks/dlrm/ootb/data_loader_terabyte.py b/benchmarks/dlrm/ootb/data_loader_terabyte.py new file mode 100644 index 0000000..cf0db71 --- /dev/null +++ b/benchmarks/dlrm/ootb/data_loader_terabyte.py @@ -0,0 +1,368 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
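+#
+# Minimal usage sketch for the iterator-style DataLoader defined below, assuming
+# pre-processed "day_<i>_reordered.npz" files and "day_day_count.npz" under ./input:
+#
+#   train_data = DataLoader(data_filename="day", data_directory="./input",
+#                           days=list(range(23)), batch_size=2048, split="train")
+#   for X_int, lS_o, X_cat, y in train_data:
+#       ...  # one mini-batch, already converted to torch tensors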
+ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import numpy as np +from torch.utils.data import Dataset +import torch +import time +import math +from tqdm import tqdm +import argparse + + +class DataLoader: + """ + DataLoader dedicated for the Criteo Terabyte Click Logs dataset + """ + + def __init__( + self, + data_filename, + data_directory, + days, + batch_size, + max_ind_range=-1, + split="train", + drop_last_batch=False + ): + self.data_filename = data_filename + self.data_directory = data_directory + self.days = days + self.batch_size = batch_size + self.max_ind_range = max_ind_range + + total_file = os.path.join( + data_directory, + data_filename + "_day_count.npz" + ) + with np.load(total_file) as data: + total_per_file = data["total_per_file"][np.array(days)] + + self.length = sum(total_per_file) + if split == "test" or split == "val": + self.length = int(np.ceil(self.length / 2.)) + self.split = split + self.drop_last_batch = drop_last_batch + + def __iter__(self): + return iter( + _batch_generator( + self.data_filename, self.data_directory, self.days, + self.batch_size, self.split, self.drop_last_batch, self.max_ind_range + ) + ) + + def __len__(self): + if self.drop_last_batch: + return self.length // self.batch_size + else: + return math.ceil(self.length / self.batch_size) + + +def _transform_features( + x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False +): + if max_ind_range > 0: + x_cat_batch = x_cat_batch % max_ind_range + + if flag_input_torch_tensor: + x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1) + x_cat_batch = x_cat_batch.clone().detach().type(torch.long) + y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1) + else: + x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1) + x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long) + y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1) + + batch_size = x_cat_batch.shape[0] + feature_count = x_cat_batch.shape[1] + lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1) + + return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1) + + +def _batch_generator( + data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range +): + previous_file = None + for day in days: + filepath = os.path.join( + data_directory, + data_filename + "_{}_reordered.npz".format(day) + ) + + # print('Loading file: ', filepath) + with np.load(filepath) as data: + x_int = data["X_int"] + x_cat = data["X_cat"] + y = data["y"] + + samples_in_file = y.shape[0] + batch_start_idx = 0 + if split == "test" or split == "val": + length = int(np.ceil(samples_in_file / 2.)) + if split == "test": + samples_in_file = length + elif split == "val": + batch_start_idx = samples_in_file - length + + while batch_start_idx < samples_in_file - batch_size: + + missing_samples = batch_size + if previous_file is not None: + missing_samples -= previous_file['y'].shape[0] + + current_slice = slice(batch_start_idx, batch_start_idx + missing_samples) + + x_int_batch = x_int[current_slice] + x_cat_batch = x_cat[current_slice] + y_batch = y[current_slice] + + if previous_file is not None: + x_int_batch = np.concatenate( + [previous_file['x_int'], x_int_batch], + axis=0 + ) + x_cat_batch = np.concatenate( + [previous_file['x_cat'], x_cat_batch], + axis=0 + ) + y_batch = np.concatenate([previous_file['y'], y_batch], axis=0) + previous_file = None + + if x_int_batch.shape[0] != 
batch_size: + raise ValueError('should not happen') + + yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range) + + batch_start_idx += missing_samples + if batch_start_idx != samples_in_file: + current_slice = slice(batch_start_idx, samples_in_file) + if previous_file is not None: + previous_file = { + 'x_int' : np.concatenate( + [previous_file['x_int'], x_int[current_slice]], + axis=0 + ), + 'x_cat' : np.concatenate( + [previous_file['x_cat'], x_cat[current_slice]], + axis=0 + ), + 'y' : np.concatenate([previous_file['y'], y[current_slice]], axis=0) + } + else: + previous_file = { + 'x_int' : x_int[current_slice], + 'x_cat' : x_cat[current_slice], + 'y' : y[current_slice] + } + + if not drop_last: + yield _transform_features( + previous_file['x_int'], + previous_file['x_cat'], + previous_file['y'], + max_ind_range + ) + + +def _test(): + generator = _batch_generator( + data_filename='day', + data_directory='./input', + days=range(23), + split="train", + batch_size=2048, + drop_last=True, + max_ind_range=-1 + ) + t1 = time.time() + for x_int, lS_o, x_cat, y in generator: + t2 = time.time() + time_diff = t2 - t1 + t1 = t2 + print( + "time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format( + time_diff, x_int.shape, lS_o.shape, x_cat.shape, y.shape + ) + ) + + +class CriteoBinDataset(Dataset): + """Binary version of criteo dataset.""" + + def __init__(self, data_file, counts_file, + batch_size=1, max_ind_range=-1, bytes_per_feature=4): + # dataset + self.tar_fea = 1 # single target + self.den_fea = 13 # 13 dense features + self.spa_fea = 26 # 26 sparse features + self.tad_fea = self.tar_fea + self.den_fea + self.tot_fea = self.tad_fea + self.spa_fea + + self.batch_size = batch_size + self.max_ind_range = max_ind_range + self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size) + + self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry) + + print('data file:', data_file, 'number of batches:', self.num_entries) + self.file = open(data_file, 'rb') + + with np.load(counts_file) as data: + self.counts = data["counts"] + + # hardcoded for now + self.m_den = 13 + + def __len__(self): + return self.num_entries + + def __getitem__(self, idx): + self.file.seek(idx * self.bytes_per_entry, 0) + raw_data = self.file.read(self.bytes_per_entry) + array = np.frombuffer(raw_data, dtype=np.int32) + tensor = torch.from_numpy(array).view((-1, self.tot_fea)) + + return _transform_features(x_int_batch=tensor[:, 1:14], + x_cat_batch=tensor[:, 14:], + y_batch=tensor[:, 0], + max_ind_range=self.max_ind_range, + flag_input_torch_tensor=True) + + def __del__(self): + self.file.close() + + +def numpy_to_binary(input_files, output_file_path, split='train'): + """Convert the data to a binary format to be read with CriteoBinDataset.""" + + # WARNING - both categorical and numerical data must fit into int32 for + # the following code to work correctly + + with open(output_file_path, 'wb') as output_file: + if split == 'train': + for input_file in input_files: + print('Processing file: ', input_file) + + np_data = np.load(input_file) + np_data = np.concatenate([np_data['y'].reshape(-1, 1), + np_data['X_int'], + np_data['X_cat']], axis=1) + np_data = np_data.astype(np.int32) + + output_file.write(np_data.tobytes()) + else: + assert len(input_files) == 1 + np_data = np.load(input_files[0]) + np_data = np.concatenate([np_data['y'].reshape(-1, 1), + np_data['X_int'], + np_data['X_cat']], axis=1) + np_data = np_data.astype(np.int32) + + samples_in_file = 
np_data.shape[0] + midpoint = int(np.ceil(samples_in_file / 2.)) + if split == "test": + begin = 0 + end = midpoint + elif split == "val": + begin = midpoint + end = samples_in_file + else: + raise ValueError('Unknown split value: ', split) + + output_file.write(np_data[begin:end].tobytes()) + + +def _preprocess(args): + train_files = ['{}_{}_reordered.npz'.format(args.input_data_prefix, day) for + day in range(0, 23)] + + test_valid_file = args.input_data_prefix + '_23_reordered.npz' + + os.makedirs(args.output_directory, exist_ok=True) + for split in ['train', 'val', 'test']: + print('Running preprocessing for split =', split) + + output_file = os.path.join(args.output_directory, + '{}_data.bin'.format(split)) + + input_files = train_files if split == 'train' else [test_valid_file] + numpy_to_binary(input_files=input_files, + output_file_path=output_file, + split=split) + + +def _test_bin(): + parser = argparse.ArgumentParser() + parser.add_argument('--output_directory', required=True) + parser.add_argument('--input_data_prefix', required=True) + parser.add_argument('--split', choices=['train', 'test', 'val'], + required=True) + args = parser.parse_args() + + _preprocess(args) + + binary_data_file = os.path.join(args.output_directory, + '{}_data.bin'.format(args.split)) + + counts_file = os.path.join(args.output_directory, 'day_fea_count.npz') + dataset_binary = CriteoBinDataset(data_file=binary_data_file, + counts_file=counts_file, + batch_size=2048,) + from dlrm_data_pytorch import CriteoDataset + from dlrm_data_pytorch import collate_wrapper_criteo_offset as collate_wrapper_criteo + + binary_loader = torch.utils.data.DataLoader( + dataset_binary, + batch_size=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + ) + + original_dataset = CriteoDataset( + dataset='terabyte', + max_ind_range=10 * 1000 * 1000, + sub_sample_rate=1, + randomize=True, + split=args.split, + raw_path=args.input_data_prefix, + pro_data='dummy_string', + memory_map=True + ) + + original_loader = torch.utils.data.DataLoader( + original_dataset, + batch_size=2048, + shuffle=False, + num_workers=0, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, + ) + + assert len(dataset_binary) == len(original_loader) + for i, (old_batch, new_batch) in tqdm(enumerate(zip(original_loader, + binary_loader)), + total=len(dataset_binary)): + + for j in range(len(new_batch)): + if not np.array_equal(old_batch[j], new_batch[j]): + raise ValueError('FAILED: Datasets not equal') + if i > len(dataset_binary): + break + print('PASSED') + + +if __name__ == '__main__': + _test() + _test_bin() diff --git a/benchmarks/dlrm/ootb/data_utils.py b/benchmarks/dlrm/ootb/data_utils.py new file mode 100644 index 0000000..bf76dff --- /dev/null +++ b/benchmarks/dlrm/ootb/data_utils.py @@ -0,0 +1,1292 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
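+#
+# Stand-alone command-line sketch (arguments are parsed in __main__ at the bottom
+# of this file), e.g. for the Kaggle dataset:
+#   python data_utils.py --data-set=kaggle --raw-data-file=./input/train.txt \
+#       --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz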
+# +# Description: generate inputs and targets for the DLRM benchmark +# +# Utility function(s) to download and pre-process public data sets +# - Criteo Kaggle Display Advertising Challenge Dataset +# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset +# - Criteo Terabyte Dataset +# https://labs.criteo.com/2013/12/download-terabyte-click-logs +# +# After downloading dataset, run: +# getCriteoAdData( +# datafile="", +# o_filename=kaggleAdDisplayChallenge_processed.npz, +# max_ind_range=-1, +# sub_sample_rate=0.0, +# days=7, +# data_split='train', +# randomize='total', +# criteo_kaggle=True, +# memory_map=False +# ) +# getCriteoAdData( +# datafile="", +# o_filename=terabyte_processed.npz, +# max_ind_range=-1, +# sub_sample_rate=0.0, +# days=24, +# data_split='train', +# randomize='total', +# criteo_kaggle=False, +# memory_map=False +# ) + +from __future__ import absolute_import, division, print_function, unicode_literals + +import sys +# import os +from os import path +from multiprocessing import Process, Manager +# import io +# from io import StringIO +# import collections as coll + +import numpy as np + + +def convertUStringToDistinctIntsDict(mat, convertDicts, counts): + # Converts matrix of unicode strings into distinct integers. + # + # Inputs: + # mat (np.array): array of unicode strings to convert + # convertDicts (list): dictionary for each column + # counts (list): number of different categories in each column + # + # Outputs: + # out (np.array): array of output integers + # convertDicts (list): dictionary for each column + # counts (list): number of different categories in each column + + # check if convertDicts and counts match correct length of mat + if len(convertDicts) != mat.shape[1] or len(counts) != mat.shape[1]: + print("Length of convertDicts or counts does not match input shape") + print("Generating convertDicts and counts...") + + convertDicts = [{} for _ in range(mat.shape[1])] + counts = [0 for _ in range(mat.shape[1])] + + # initialize output + out = np.zeros(mat.shape) + + for j in range(mat.shape[1]): + for i in range(mat.shape[0]): + # add to convertDict and increment count + if mat[i, j] not in convertDicts[j]: + convertDicts[j][mat[i, j]] = counts[j] + counts[j] += 1 + out[i, j] = convertDicts[j][mat[i, j]] + + return out, convertDicts, counts + + +def convertUStringToDistinctIntsUnique(mat, mat_uni, counts): + # mat is an array of 0,...,# samples, with each being 26 categorical features + + # check if mat_unique and counts match correct length of mat + if len(mat_uni) != mat.shape[1] or len(counts) != mat.shape[1]: + print("Length of mat_unique or counts does not match input shape") + print("Generating mat_unique and counts...") + + mat_uni = [np.array([]) for _ in range(mat.shape[1])] + counts = [0 for _ in range(mat.shape[1])] + + # initialize output + out = np.zeros(mat.shape) + ind_map = [np.array([]) for _ in range(mat.shape[1])] + + # find out and assign unique ids to features + for j in range(mat.shape[1]): + m = mat_uni[j].size + mat_concat = np.concatenate((mat_uni[j], mat[:, j])) + mat_uni[j], ind_map[j] = np.unique(mat_concat, return_inverse=True) + out[:, j] = ind_map[j][m:] + counts[j] = mat_uni[j].size + + return out, mat_uni, counts + + +def processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, pre_comp_counts): + # Process Kaggle Display Advertising Challenge or Terabyte Dataset + # by converting unicode strings in X_cat to integers and + # converting negative integer values in X_int. 
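+    #
+    # (Illustrative only: convertDicts[j] maps each raw hashed categorical id to a
+    # dense index, e.g. a hypothetical {0x5316a17f: 0, 0x0b153874: 1, ...}, so the
+    # loop below rewrites X_cat_t[j, k] as a small contiguous integer.)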
+ # + # Loads data in the form "{kaggle|terabyte}_day_i.npz" where i is the day. + # + # Inputs: + # d_path (str): path for {kaggle|terabyte}_day_i.npz files + # i (int): splits in the dataset (typically 0 to 7 or 0 to 24) + + # process data if not all files exist + filename_i = npzfile + "_{0}_processed.npz".format(i) + + if path.exists(filename_i): + print("Using existing " + filename_i, end="\n") + else: + print("Not existing " + filename_i) + with np.load(npzfile + "_{0}.npz".format(i)) as data: + # categorical features + ''' + # Approach 1a: using empty dictionaries + X_cat, convertDicts, counts = convertUStringToDistinctIntsDict( + data["X_cat"], convertDicts, counts + ) + ''' + ''' + # Approach 1b: using empty np.unique + X_cat, convertDicts, counts = convertUStringToDistinctIntsUnique( + data["X_cat"], convertDicts, counts + ) + ''' + # Approach 2a: using pre-computed dictionaries + X_cat_t = np.zeros(data["X_cat_t"].shape) + for j in range(26): + for k, x in enumerate(data["X_cat_t"][j, :]): + X_cat_t[j, k] = convertDicts[j][x] + # continuous features + X_int = data["X_int"] + X_int[X_int < 0] = 0 + # targets + y = data["y"] + + np.savez_compressed( + filename_i, + # X_cat = X_cat, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=X_int, + y=y, + ) + print("Processed " + filename_i, end="\n") + # sanity check (applicable only if counts have been pre-computed & are re-computed) + # for j in range(26): + # if pre_comp_counts[j] != counts[j]: + # sys.exit("ERROR: Sanity check on counts has failed") + # print("\nSanity check on counts passed") + + return + + +def concatCriteoAdData( + d_path, + d_file, + npzfile, + trafile, + days, + data_split, + randomize, + total_per_file, + total_count, + memory_map, + o_filename +): + # Concatenates different days and saves the result. 
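+    # With memory_map=True the per-day files are reshuffled on disk (two-pass FYR
+    # shuffle below) into "<npzfile>_<i>_reordered.npz" files; with memory_map=False
+    # all processed days are concatenated into a single "<o_filename>.npz" holding
+    # X_cat, X_int, y and counts.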
+ # + # Inputs: + # days (int): total number of days in the dataset (typically 7 or 24) + # d_path (str): path for {kaggle|terabyte}_day_i.npz files + # o_filename (str): output file name + # + # Output: + # o_file (str): output file path + + if memory_map: + # dataset break up per fea + # tar_fea = 1 # single target + den_fea = 13 # 13 dense features + spa_fea = 26 # 26 sparse features + # tad_fea = tar_fea + den_fea + # tot_fea = tad_fea + spa_fea + # create offset per file + offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + offset_per_file[i + 1] += offset_per_file[i] + + ''' + # Approach 1, 2 and 3 use indices, while Approach 4 does not use them + # create indices + indices = np.arange(total_count) + if data_split == "none": + if randomize == "total": + indices = np.random.permutation(indices) + else: + indices = np.array_split(indices, offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + indices = np.concatenate((train_indices, test_indices)) + # no reordering + # indices = np.arange(total_count) + ''' + ''' + # Approach 1: simple and slow (no grouping is used) + # check if data already exists + recreate_flag = False + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((total_count)) + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered".format(j) + np.save(filename_j, z) + print("Creating " + filename_j) + + for i in range(days): + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat_t = np.transpose(data["X_cat"]) + X_int_t = np.transpose(data["X_int"]) + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + # print(filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r+') + if j < tar_fea: + fj[indices[start:end]] = y + elif tar_fea <= j and j < tad_fea: + fj[indices[start:end]] = X_int_t[j - tar_fea, :] + else: + fj[indices[start:end]] = X_cat_t[j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing " + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for i in range(days): + filename_i = d_path + 
npzfile + "_{0}_reordered.npz".format(i) + size = total_per_file[i] + X_int_t = np.zeros((den_fea, size)) + X_cat_t = np.zeros((spa_fea, size)) + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + print("Creating " + filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r') + if j < tar_fea: + y = fj[start:end] + elif tar_fea <= j and j < tad_fea: + X_int_t[j - tar_fea, :] = fj[start:end] + else: + X_cat_t[j - tad_fea, :] = fj[start:end] + del fj + + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=np.transpose(X_int_t), # transpose of the data + y=y, + ) + else: + print("Reordered day files already exist, skipping ...") + ''' + ''' + # Approach 2: group days + # check if data already exists + recreate_flag = False + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((total_count)) + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered".format(j) + np.save(filename_j, z) + print("Creating " + filename_j) + + group_day = 3 # e.g. 8, 4 or 3 + group_num = days // group_day + file_group = [i*group_day for i in range(group_num)] + [days] + for ii in range(group_num): + # for last may be group_size != group_num, therefore reset it below + group_size = file_group[ii + 1] - file_group[ii] + X_cat_t = [0]*group_size + X_int_t = [0]*group_size + y = [0]*group_size + start = [0]*group_size + end = [0]*group_size + for ig in range(group_size): + i = file_group[ii] + ig + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + # setup start and end ranges + start[ig] = offset_per_file[i] + end[ig] = offset_per_file[i + 1] + # print(filename_i) + # load a group of files + with np.load(filename_i) as data: + X_cat_t[ig] = np.transpose(data["X_cat"]) + X_int_t[ig] = np.transpose(data["X_int"]) + y[ig] = data["y"] + # sanity check + if total_per_file[i] != len(y[ig]): + sys.exit("ERROR: sanity check on number of samples failed") + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end[ig]-start[ig]) + "=" + str(total_per_file[i])) + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r+') + for ig in range(group_size): + if j < tar_fea: + fj[indices[start[ig]:end[ig]]] = y[ig] + elif tar_fea <= j and j < tad_fea: + fj[indices[start[ig]:end[ig]]] = X_int_t[ig][j - tar_fea, :] + else: + fj[indices[start[ig]:end[ig]]] = X_cat_t[ig][j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing " + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for ii in range(group_num): + # for last may be group_size != group_num, therefore reset it below + 
group_size = file_group[ii + 1] - file_group[ii] + X_cat_t= []; X_int_t = [] + for ig in range(group_size): + i = file_group[ii] + ig + X_int_t.append(np.zeros((den_fea, total_per_file[i]))) + X_cat_t.append(np.zeros((spa_fea, total_per_file[i]))) + y = [0]*group_size + start = [0]*group_size + end = [0]*group_size + + for j in range(tot_fea): + filename_j = trafile + "_{0}_reordered.npy".format(j) + fj = np.load(filename_j, mmap_mode='r') + # load a group of files + for ig in range(group_size): + i = file_group[ii] + ig + # setup start and end ranges + start[ig] = offset_per_file[i] + end[ig] = offset_per_file[i + 1] + # load data for the group of files + if j < tar_fea: + y[ig] = fj[start[ig]:end[ig]] + elif tar_fea <= j and j < tad_fea: + X_int_t[ig][j - tar_fea, :] = fj[start[ig]:end[ig]] + else: + X_cat_t[ig][j - tad_fea, :] = fj[start[ig]:end[ig]] + del fj + + for ig in range(group_size): + i = file_group[ii] + ig + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + print("Creating " + filename_i) + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t[ig]), # transpose of the data + X_int=np.transpose(X_int_t[ig]), # transpose of the data + y=y[ig], + ) + else: + print("Reordered day files already exist, skipping ...") + ''' + ''' + # Approach 3: group features + # check if data already exists + group_fea = 5 # e.g. 8, 5 or 4 + group_num = tot_fea // group_fea + if tot_fea % group_fea != 0: # sanity check + sys.exit("ERROR: the group_fea must divided tot_fea evenly.") + recreate_flag = False + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # load, reorder and concatenate data (memmap all reordered files per feature) + if recreate_flag: + # init reordered files (.npy appended automatically) + z = np.zeros((group_fea, total_count)) + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}".format( + jn, group_fea + ) + np.save(filename_j, z) + print("Creating " + filename_j) + + for i in range(days): + filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat_t = np.transpose(data["X_cat"]) + X_int_t = np.transpose(data["X_int"]) + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + # print(filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + fj = np.load(filename_j, mmap_mode='r+') + for jg in range(group_fea): + j = jn * group_fea + jg + # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg)) + if j < tar_fea: + fj[jg, indices[start:end]] = y + elif tar_fea <= j and j < tad_fea: + fj[jg, indices[start:end]] = X_int_t[j - tar_fea, :] + else: + fj[jg, indices[start:end]] = X_cat_t[j - tad_fea, :] + del fj + else: + print("Reordered fea files already exist, skipping ...") + + # check if data already exists + recreate_flag = False + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + if path.exists(filename_i): + print("Using existing" + filename_i) + else: + recreate_flag = True + # split reordered data by files (memmap all reordered 
files per feature) + # on the day boundary del the file object and memmap again + if recreate_flag: + for i in range(days): + filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) + size = total_per_file[i] + X_int_t = np.zeros((den_fea, size)) + X_cat_t = np.zeros((spa_fea, size)) + # setup start and end ranges + start = offset_per_file[i] + end = offset_per_file[i + 1] + print("Creating " + filename_i) + # print("start=" + str(start) + " end=" + str(end) + # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) + + for jn in range(group_num): + filename_j = trafile + "_{0}_reordered{1}.npy".format( + jn, group_fea + ) + fj = np.load(filename_j, mmap_mode='r') + for jg in range(group_fea): + j = jn * group_fea + jg + # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg)) + if j < tar_fea: + y = fj[jg, start:end] + elif tar_fea <= j and j < tad_fea: + X_int_t[j - tar_fea, :] = fj[jg, start:end] + else: + X_cat_t[j - tad_fea, :] = fj[jg, start:end] + del fj + + np.savez_compressed( + filename_i, + X_cat=np.transpose(X_cat_t), # transpose of the data + X_int=np.transpose(X_int_t), # transpose of the data + y=y, + ) + + else: + print("Reordered day files already exist, skipping ...") + ''' + + # Approach 4: Fisher-Yates-Rao (FYR) shuffle algorithm + # 1st pass of FYR shuffle + # check if data already exists + recreate_flag = False + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + if ( + path.exists(filename_j_y) + and path.exists(filename_j_d) + and path.exists(filename_j_s) + ): + print( + "Using existing\n" + + filename_j_y + "\n" + + filename_j_d + "\n" + + filename_j_s + ) + else: + recreate_flag = True + # reorder across buckets using sampling + if recreate_flag: + # init intermediate files (.npy appended automatically) + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s".format(j) + np.save(filename_j_y, np.zeros((total_per_file[j]))) + np.save(filename_j_d, np.zeros((total_per_file[j], den_fea))) + np.save(filename_j_s, np.zeros((total_per_file[j], spa_fea))) + # start processing files + total_counter = [0] * days + for i in range(days): + filename_i = npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + X_cat = data["X_cat"] + X_int = data["X_int"] + y = data["y"] + size = len(y) + # sanity check + if total_per_file[i] != size: + sys.exit("ERROR: sanity check on number of samples failed") + # debug prints + print("Reordering (1st pass) " + filename_i) + + # create buckets using sampling of random ints + # from (discrete) uniform distribution + buckets = [] + for _j in range(days): + buckets.append([]) + counter = [0] * days + days_to_sample = days if data_split == "none" else days - 1 + if randomize == "total": + rand_u = np.random.randint(low=0, high=days_to_sample, size=size) + for k in range(size): + # sample and make sure elements per buckets do not overflow + if data_split == "none" or i < days - 1: + # choose bucket + p = rand_u[k] + # retry of the bucket is full + while total_counter[p] + counter[p] >= total_per_file[p]: + p = np.random.randint(low=0, high=days_to_sample) + else: # preserve the last day/bucket if needed + p = i + buckets[p].append(k) + counter[p] += 1 + else: # randomize is day or none + for k in range(size): + # do 
not sample, preserve the data in this bucket + p = i + buckets[p].append(k) + counter[p] += 1 + + # sanity check + if np.sum(counter) != size: + sys.exit("ERROR: sanity check on number of samples failed") + # debug prints + # print(counter) + # print(str(np.sum(counter)) + " = " + str(size)) + # print([len(x) for x in buckets]) + # print(total_counter) + + # partially feel the buckets + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + start = total_counter[j] + end = total_counter[j] + counter[j] + # target buckets + fj_y = np.load(filename_j_y, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - start=" + str(end - start) + " " + # + str(fj_y[start:end].shape) + " " + # + str(len(buckets[j]))) + fj_y[start:end] = y[buckets[j]] + del fj_y + # dense buckets + fj_d = np.load(filename_j_d, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - start=" + str(end - start) + " " + # + str(fj_d[start:end, :].shape) + " " + # + str(len(buckets[j]))) + fj_d[start:end, :] = X_int[buckets[j], :] + del fj_d + # sparse buckets + fj_s = np.load(filename_j_s, mmap_mode='r+') + # print("start=" + str(start) + " end=" + str(end) + # + " end - start=" + str(end - start) + " " + # + str(fj_s[start:end, :].shape) + " " + # + str(len(buckets[j]))) + fj_s[start:end, :] = X_cat[buckets[j], :] + del fj_s + # update counters for next step + total_counter[j] += counter[j] + + # 2nd pass of FYR shuffle + # check if data already exists + for j in range(days): + filename_j = npzfile + "_{0}_reordered.npz".format(j) + if path.exists(filename_j): + print("Using existing " + filename_j) + else: + recreate_flag = True + # reorder within buckets + if recreate_flag: + for j in range(days): + filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) + filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) + filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) + fj_y = np.load(filename_j_y) + fj_d = np.load(filename_j_d) + fj_s = np.load(filename_j_s) + + indices = range(total_per_file[j]) + if randomize == "day" or randomize == "total": + if data_split == "none" or j < days - 1: + indices = np.random.permutation(range(total_per_file[j])) + + filename_r = npzfile + "_{0}_reordered.npz".format(j) + print("Reordering (2nd pass) " + filename_r) + np.savez_compressed( + filename_r, + X_cat=fj_s[indices, :], + X_int=fj_d[indices, :], + y=fj_y[indices], + ) + + ''' + # sanity check (under no reordering norms should be zero) + for i in range(days): + filename_i_o = npzfile + "_{0}_processed.npz".format(i) + print(filename_i_o) + with np.load(filename_i_o) as data_original: + X_cat_o = data_original["X_cat"] + X_int_o = data_original["X_int"] + y_o = data_original["y"] + filename_i_r = npzfile + "_{0}_reordered.npz".format(i) + print(filename_i_r) + with np.load(filename_i_r) as data_reordered: + X_cat_r = data_reordered["X_cat"] + X_int_r = data_reordered["X_int"] + y_r = data_reordered["y"] + print(np.linalg.norm(y_o - y_r)) + print(np.linalg.norm(X_int_o - X_int_r)) + print(np.linalg.norm(X_cat_o - X_cat_r)) + ''' + + else: + print("Concatenating multiple days into %s.npz file" % str(d_path + o_filename)) + + # load and concatenate data + for i in range(days): + filename_i = npzfile + "_{0}_processed.npz".format(i) + with np.load(filename_i) as data: + if i == 0: + X_cat = data["X_cat"] + X_int 
= data["X_int"] + y = data["y"] + else: + X_cat = np.concatenate((X_cat, data["X_cat"])) + X_int = np.concatenate((X_int, data["X_int"])) + y = np.concatenate((y, data["y"])) + print("Loaded day:", i, "y = 1:", len(y[y == 1]), "y = 0:", len(y[y == 0])) + + with np.load(d_path + d_file + "_fea_count.npz") as data: + counts = data["counts"] + print("Loaded counts!") + + np.savez_compressed( + d_path + o_filename + ".npz", + X_cat=X_cat, + X_int=X_int, + y=y, + counts=counts, + ) + + return d_path + o_filename + ".npz" + + +def transformCriteoAdData(X_cat, X_int, y, days, data_split, randomize, total_per_file): + # Transforms Criteo Kaggle or terabyte data by applying log transformation + # on dense features and converting everything to appropriate tensors. + # + # Inputs: + # X_cat (ndarray): array of integers corresponding to preprocessed + # categorical features + # X_int (ndarray): array of integers corresponding to dense features + # y (ndarray): array of bool corresponding to labels + # data_split(str): flag for splitting dataset into training/validation/test + # sets + # randomize (str): determines randomization scheme + # "none": no randomization + # "day": randomizes each day"s data (only works if split = True) + # "total": randomizes total dataset + # + # Outputs: + # if split: + # X_cat_train (tensor): sparse features for training set + # X_int_train (tensor): dense features for training set + # y_train (tensor): labels for training set + # X_cat_val (tensor): sparse features for validation set + # X_int_val (tensor): dense features for validation set + # y_val (tensor): labels for validation set + # X_cat_test (tensor): sparse features for test set + # X_int_test (tensor): dense features for test set + # y_test (tensor): labels for test set + # else: + # X_cat (tensor): sparse features + # X_int (tensor): dense features + # y (tensor): label + + # define initial set of indices + indices = np.arange(len(y)) + + # create offset per file + offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + offset_per_file[i + 1] += offset_per_file[i] + + # split dataset + if data_split == 'train': + indices = np.array_split(indices, offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + test_indices, val_indices = np.array_split(test_indices, 2) + + print("Defined training and testing indices...") + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + # indices = np.concatenate((train_indices, test_indices)) + + # create training, validation, and test sets + X_cat_train = X_cat[train_indices] + X_int_train = X_int[train_indices] + y_train = y[train_indices] + + X_cat_val = X_cat[val_indices] + X_int_val = X_int[val_indices] + y_val = y[val_indices] + + X_cat_test = X_cat[test_indices] + X_int_test = X_int[test_indices] + y_test = y[test_indices] + + print("Split data according to indices...") + + X_cat_train = X_cat_train.astype(np.long) + X_int_train = np.log(X_int_train.astype(np.float32) + 1) + y_train = y_train.astype(np.float32) + + X_cat_val = X_cat_val.astype(np.long) + X_int_val = np.log(X_int_val.astype(np.float32) + 1) + y_val = y_val.astype(np.float32) + + X_cat_test = 
X_cat_test.astype(np.long) + X_int_test = np.log(X_int_test.astype(np.float32) + 1) + y_test = y_test.astype(np.float32) + + print("Converted to tensors...done!") + + return ( + X_cat_train, + X_int_train, + y_train, + X_cat_val, + X_int_val, + y_val, + X_cat_test, + X_int_test, + y_test, + ) + + else: + + # randomize data + if randomize == "total": + indices = np.random.permutation(indices) + print("Randomized indices...") + + X_cat = X_cat[indices].astype(np.long) + X_int = np.log(X_int[indices].astype(np.float32) + 1) + y = y[indices].astype(np.float32) + + print("Converted to tensors...done!") + + return (X_cat, X_int, y, [], [], [], [], [], []) + + +def getCriteoAdData( + datafile, + o_filename, + max_ind_range=-1, + sub_sample_rate=0.0, + days=7, + data_split='train', + randomize='total', + criteo_kaggle=True, + memory_map=False, + dataset_multiprocessing=False, +): + # Passes through entire dataset and defines dictionaries for categorical + # features and determines the number of total categories. + # + # Inputs: + # datafile : path to downloaded raw data file + # o_filename (str): saves results under o_filename if filename is not "" + # + # Output: + # o_file (str): output file path + + #split the datafile into path and filename + lstr = datafile.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + d_file = lstr[-1].split(".")[0] if criteo_kaggle else lstr[-1] + npzfile = d_path + ((d_file + "_day") if criteo_kaggle else d_file) + trafile = d_path + ((d_file + "_fea") if criteo_kaggle else "fea") + + # count number of datapoints in training set + total_file = d_path + d_file + "_day_count.npz" + if path.exists(total_file): + with np.load(total_file) as data: + total_per_file = list(data["total_per_file"]) + total_count = np.sum(total_per_file) + print("Skipping counts per file (already exist)") + else: + total_count = 0 + total_per_file = [] + if criteo_kaggle: + # WARNING: The raw data consists of a single train.txt file + # Each line in the file is a sample, consisting of 13 continuous and + # 26 categorical features (an extra space indicates that feature is + # missing and will be interpreted as 0). + if path.exists(datafile): + print("Reading data from path=%s" % (datafile)) + with open(str(datafile)) as f: + for _ in f: + total_count += 1 + total_per_file.append(total_count) + # reset total per file due to split + num_data_per_split, extras = divmod(total_count, days) + total_per_file = [num_data_per_split] * days + for j in range(extras): + total_per_file[j] += 1 + # split into days (simplifies code later on) + file_id = 0 + boundary = total_per_file[file_id] + nf = open(npzfile + "_" + str(file_id), "w") + with open(str(datafile)) as f: + for j, line in enumerate(f): + if j == boundary: + nf.close() + file_id += 1 + nf = open(npzfile + "_" + str(file_id), "w") + boundary += total_per_file[file_id] + nf.write(line) + nf.close() + else: + sys.exit("ERROR: Criteo Kaggle Display Ad Challenge Dataset path is invalid; please download from https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset") + else: + # WARNING: The raw data consist of day_0.gz,... ,day_23.gz text files + # Each line in the file is a sample, consisting of 13 continuous and + # 26 categorical features (an extra space indicates that feature is + # missing and will be interpreted as 0). 
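+            # (The loop below only counts samples per raw day file so that the
+            # per-day totals can later be cached in "<d_file>_day_count.npz" and
+            # the counting pass skipped on subsequent runs.)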
+ for i in range(days): + datafile_i = datafile + "_" + str(i) # + ".gz" + if path.exists(str(datafile_i)): + print("Reading data from path=%s" % (str(datafile_i))) + # file day_ + total_per_file_count = 0 + with open(str(datafile_i)) as f: + for _ in f: + total_per_file_count += 1 + total_per_file.append(total_per_file_count) + total_count += total_per_file_count + else: + sys.exit("ERROR: Criteo Terabyte Dataset path is invalid; please download from https://labs.criteo.com/2013/12/download-terabyte-click-logs") + + # process a file worth of data and reinitialize data + # note that a file main contain a single or multiple splits + def process_one_file( + datfile, + npzfile, + split, + num_data_in_split, + dataset_multiprocessing, + convertDictsDay=None, + resultDay=None + ): + if dataset_multiprocessing: + convertDicts_day = [{} for _ in range(26)] + + with open(str(datfile)) as f: + y = np.zeros(num_data_in_split, dtype="i4") # 4 byte int + X_int = np.zeros((num_data_in_split, 13), dtype="i4") # 4 byte int + X_cat = np.zeros((num_data_in_split, 26), dtype="i4") # 4 byte int + if sub_sample_rate == 0.0: + rand_u = 1.0 + else: + rand_u = np.random.uniform(low=0.0, high=1.0, size=num_data_in_split) + + i = 0 + percent = 0 + for k, line in enumerate(f): + # process a line (data point) + line = line.split('\t') + # set missing values to zero + for j in range(len(line)): + if (line[j] == '') or (line[j] == '\n'): + line[j] = '0' + # sub-sample data by dropping zero targets, if needed + target = np.int32(line[0]) + if target == 0 and \ + (rand_u if sub_sample_rate == 0.0 else rand_u[k]) < sub_sample_rate: + continue + + y[i] = target + X_int[i] = np.array(line[1:14], dtype=np.int32) + if max_ind_range > 0: + X_cat[i] = np.array( + list(map(lambda x: int(x, 16) % max_ind_range, line[14:])), + dtype=np.int32 + ) + else: + X_cat[i] = np.array( + list(map(lambda x: int(x, 16), line[14:])), + dtype=np.int32 + ) + + # count uniques + if dataset_multiprocessing: + for j in range(26): + convertDicts_day[j][X_cat[i][j]] = 1 + # debug prints + if float(i)/num_data_in_split*100 > percent+1: + percent = int(float(i)/num_data_in_split*100) + print( + "Load %d/%d (%d%%) Split: %d Label True: %d Stored: %d" + % ( + i, + num_data_in_split, + percent, + split, + target, + y[i], + ), + end="\n", + ) + else: + for j in range(26): + convertDicts[j][X_cat[i][j]] = 1 + # debug prints + print( + "Load %d/%d Split: %d Label True: %d Stored: %d" + % ( + i, + num_data_in_split, + split, + target, + y[i], + ), + end="\r", + ) + i += 1 + + # store num_data_in_split samples or extras at the end of file + # count uniques + # X_cat_t = np.transpose(X_cat) + # for j in range(26): + # for x in X_cat_t[j,:]: + # convertDicts[j][x] = 1 + # store parsed + filename_s = npzfile + "_{0}.npz".format(split) + if path.exists(filename_s): + print("\nSkip existing " + filename_s) + else: + np.savez_compressed( + filename_s, + X_int=X_int[0:i, :], + # X_cat=X_cat[0:i, :], + X_cat_t=np.transpose(X_cat[0:i, :]), # transpose of the data + y=y[0:i], + ) + print("\nSaved " + npzfile + "_{0}.npz!".format(split)) + + if dataset_multiprocessing: + resultDay[split] = i + convertDictsDay[split] = convertDicts_day + return + else: + return i + + # create all splits (reuse existing files if possible) + recreate_flag = False + convertDicts = [{} for _ in range(26)] + # WARNING: to get reproducable sub-sampling results you must reset the seed below + # np.random.seed(123) + # in this case there is a single split in each day + for i in range(days): + 
npzfile_i = npzfile + "_{0}.npz".format(i) + npzfile_p = npzfile + "_{0}_processed.npz".format(i) + if path.exists(npzfile_i): + print("Skip existing " + npzfile_i) + elif path.exists(npzfile_p): + print("Skip existing " + npzfile_p) + else: + recreate_flag = True + + if recreate_flag: + if dataset_multiprocessing: + resultDay = Manager().dict() + convertDictsDay = Manager().dict() + processes = [Process(target=process_one_file, + name="process_one_file:%i" % i, + args=(npzfile + "_{0}".format(i), + npzfile, + i, + total_per_file[i], + dataset_multiprocessing, + convertDictsDay, + resultDay, + ) + ) for i in range(0, days)] + for process in processes: + process.start() + for process in processes: + process.join() + for day in range(days): + total_per_file[day] = resultDay[day] + print("Constructing convertDicts Split: {}".format(day)) + convertDicts_tmp = convertDictsDay[day] + for i in range(26): + for j in convertDicts_tmp[i]: + convertDicts[i][j] = 1 + else: + for i in range(days): + total_per_file[i] = process_one_file( + npzfile + "_{0}".format(i), + npzfile, + i, + total_per_file[i], + dataset_multiprocessing, + ) + + # report and save total into a file + total_count = np.sum(total_per_file) + if not path.exists(total_file): + np.savez_compressed(total_file, total_per_file=total_per_file) + print("Total number of samples:", total_count) + print("Divided into days/splits:\n", total_per_file) + + # dictionary files + counts = np.zeros(26, dtype=np.int32) + if recreate_flag: + # create dictionaries + for j in range(26): + for i, x in enumerate(convertDicts[j]): + convertDicts[j][x] = i + dict_file_j = d_path + d_file + "_fea_dict_{0}.npz".format(j) + if not path.exists(dict_file_j): + np.savez_compressed( + dict_file_j, + unique=np.array(list(convertDicts[j]), dtype=np.int32) + ) + counts[j] = len(convertDicts[j]) + # store (uniques and) counts + count_file = d_path + d_file + "_fea_count.npz" + if not path.exists(count_file): + np.savez_compressed(count_file, counts=counts) + else: + # create dictionaries (from existing files) + for j in range(26): + with np.load(d_path + d_file + "_fea_dict_{0}.npz".format(j)) as data: + unique = data["unique"] + for i, x in enumerate(unique): + convertDicts[j][x] = i + # load (uniques and) counts + with np.load(d_path + d_file + "_fea_count.npz") as data: + counts = data["counts"] + + # process all splits + if dataset_multiprocessing: + processes = [Process(target=processCriteoAdData, + name="processCriteoAdData:%i" % i, + args=(d_path, + d_file, + npzfile, + i, + convertDicts, + counts, + ) + ) for i in range(0, days)] + for process in processes: + process.start() + for process in processes: + process.join() + + else: + for i in range(days): + processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, counts) + + o_file = concatCriteoAdData( + d_path, + d_file, + npzfile, + trafile, + days, + data_split, + randomize, + total_per_file, + total_count, + memory_map, + o_filename + ) + + return o_file + + +def loadDataset( + dataset, + max_ind_range, + sub_sample_rate, + randomize, + data_split, + raw_path="", + pro_data="", + memory_map=False +): + # dataset + if dataset == "kaggle": + days = 7 + o_filename = "kaggleAdDisplayChallenge_processed" + elif dataset == "terabyte": + days = 24 + o_filename = "terabyte_processed" + else: + raise(ValueError("Data set option is not supported")) + + # split the datafile into path and filename + lstr = raw_path.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + d_file = lstr[-1].split(".")[0] if dataset == 
"kaggle" else lstr[-1] + npzfile = (d_file + "_day") if dataset == "kaggle" else d_file + # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea") + + # check if pre-processed data is available + data_ready = True + if memory_map: + for i in range(days): + reo_data = d_path + npzfile + "_{0}_reordered.npz".format(i) + if not path.exists(str(reo_data)): + data_ready = False + else: + if not path.exists(str(pro_data)): + data_ready = False + + # pre-process data if needed + # WARNNING: when memory mapping is used we get a collection of files + if data_ready: + print("Reading pre-processed data=%s" % (str(pro_data))) + file = str(pro_data) + else: + print("Reading raw data=%s" % (str(raw_path))) + file = getCriteoAdData( + raw_path, + o_filename, + max_ind_range, + sub_sample_rate, + days, + data_split, + randomize, + dataset == "kaggle", + memory_map + ) + + return file, days + + +if __name__ == "__main__": + ### import packages ### + import argparse + + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Preprocess Criteo dataset" + ) + # model related parameters + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--memory-map", action="store_true", default=False) + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + args = parser.parse_args() + + loadDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map + ) diff --git a/benchmarks/dlrm/ootb/dlrm_data_caffe2.py b/benchmarks/dlrm/ootb/dlrm_data_caffe2.py new file mode 100644 index 0000000..0bda2ac --- /dev/null +++ b/benchmarks/dlrm/ootb/dlrm_data_caffe2.py @@ -0,0 +1,843 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: generate inputs and targets for the dlrm benchmark +# The inpts and outputs are generated according to the following three option(s) +# 1) random distribution +# 2) synthetic distribution, based on unique accesses and distances between them +# i) R. Hassan, A. Harris, N. Topham and A. 
Efthymiou "Synthetic Trace-Driven +# Simulation of Cache Memory", IEEE AINAM'07 +# 3) public data set +# i) Criteo Kaggle Display Advertising Challenge Dataset +# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset +# ii) Criteo Terabyte Dataset +# https://labs.criteo.com/2013/12/download-terabyte-click-logs + + +from __future__ import absolute_import, division, print_function, unicode_literals + +import bisect +import collections + +# others +# from os import path +import sys + +import data_utils + +# numpy +import numpy as np + +# pytorch +import torch +from numpy import random as ra +from torch.utils.data import Dataset + + +# Kaggle Display Advertising Challenge Dataset +# dataset (str): name of dataset (Kaggle or Terabyte) +# randomize (str): determines randomization scheme +# 'none': no randomization +# 'day': randomizes each day's data (only works if split = True) +# 'total': randomizes total dataset +# split (bool) : to split into train, test, validation data-sets + + +class CriteoDatasetWMemoryMap(Dataset): + def __init__( + self, + dataset, + max_ind_range, + sub_sample_rate, + randomize, + split="train", + raw_path="", + pro_data="", + ): + # dataset + # tar_fea = 1 # single target + den_fea = 13 # 13 dense features + # spa_fea = 26 # 26 sparse features + # tad_fea = tar_fea + den_fea + # tot_fea = tad_fea + spa_fea + if dataset == "kaggle": + days = 7 + elif dataset == "terabyte": + days = 24 + else: + raise (ValueError("Data set option is not supported")) + self.max_ind_range = max_ind_range + + # split the datafile into path and filename + lstr = raw_path.split("/") + self.d_path = "/".join(lstr[0:-1]) + "/" + self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1] + self.npzfile = self.d_path + ( + (self.d_file + "_day") if dataset == "kaggle" else self.d_file + ) + self.trafile = self.d_path + ( + (self.d_file + "_fea") if dataset == "kaggle" else "fea" + ) + + # get a number of samples per day + total_file = self.d_path + self.d_file + "_day_count.npz" + with np.load(total_file) as data: + total_per_file = data["total_per_file"] + # compute offsets per file + self.offset_per_file = np.array([0] + list(total_per_file)) + for i in range(days): + self.offset_per_file[i + 1] += self.offset_per_file[i] + # print(self.offset_per_file) + + # setup data + self.split = split + if split == "none" or split == "train": + self.day = 0 + self.max_day_range = days if split == "none" else days - 1 + elif split == "test" or split == "val": + self.day = days - 1 + num_samples = self.offset_per_file[days] - self.offset_per_file[days - 1] + self.test_size = int(np.ceil(num_samples / 2.0)) + self.val_size = num_samples - self.test_size + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + + # load unique counts + with np.load(self.d_path + self.d_file + "_fea_count.npz") as data: + self.counts = data["counts"] + self.m_den = den_fea # X_int.shape[1] + self.n_emb = len(self.counts) + print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den)) + + # Load the test data + # Only a single day is used for testing + if self.split == "test" or self.split == "val": + # only a single day is used for testing + fi = self.npzfile + "_{0}_reordered.npz".format(self.day) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + + def __getitem__(self, index): + + if isinstance(index, slice): + return [ + self[idx] + for 
idx in range( + index.start or 0, index.stop or len(self), index.step or 1 + ) + ] + if self.split == "none" or self.split == "train": + # check if need to swicth to next day and load data + if index == self.offset_per_file[self.day]: + # print("day_boundary switch", index) + self.day_boundary = self.offset_per_file[self.day] + fi = self.npzfile + "_{0}_reordered.npz".format(self.day) + # print('Loading file: ', fi) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + self.day = (self.day + 1) % self.max_day_range + + i = index - self.day_boundary + elif self.split == "test" or self.split == "val": + # only a single day is used for testing + i = index + (0 if self.split == "test" else self.test_size) + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + + if self.max_ind_range > 0: + return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i] + else: + return self.X_int[i], self.X_cat[i], self.y[i] + + def _default_preprocess(self, X_int, X_cat, y): + X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1) + if self.max_ind_range > 0: + X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long) + else: + X_cat = torch.tensor(X_cat, dtype=torch.long) + y = torch.tensor(y.astype(np.float32)) + + return X_int, X_cat, y + + def __len__(self): + if self.split == "none": + return self.offset_per_file[-1] + elif self.split == "train": + return self.offset_per_file[-2] + elif self.split == "test": + return self.test_size + elif self.split == "val": + return self.val_size + else: + sys.exit("ERROR: dataset split is neither none, nor train nor test.") + + +def collate_wrapper_criteo(list_of_tuples): + # where each tuple is (X_int, X_cat, y) + transposed_data = list(zip(*list_of_tuples)) + X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) + X_cat = torch.tensor(transposed_data[1], dtype=torch.long) + T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) + + batchSize = X_cat.shape[0] + featureCnt = X_cat.shape[1] + + lS_i = [X_cat[:, i] for i in range(featureCnt)] + lS_o = [torch.tensor(range(batchSize)) for _ in range(featureCnt)] + + return X_int, torch.stack(lS_o), torch.stack(lS_i), T + + +# Conversion from offset to length +def offset_to_length_convertor(lS_o, lS_i): + def diff(tensor): + return tensor[1:] - tensor[:-1] + + return torch.stack( + [ + diff(torch.cat((S_o, torch.tensor(lS_i[ind].shape))).int()) + for ind, S_o in enumerate(lS_o) + ] + ) + + +def unpack_batch(b, data_gen, data_set): + return b[0], b[1], b[2], b[3], torch.ones(b[3].size()) + + +def read_dataset( + dataset, + max_ind_range, + sub_sample_rate, + mini_batch_size, + num_batches, + randomize, + split="train", + raw_data="", + processed_data="", + memory_map=False, + inference_only=False, + test_mini_batch_size=1, +): + # split the datafile into path and filename + lstr = raw_data.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1] + # npzfile = d_path + ((d_file + "_day") if dataset == "kaggle" else d_file) + # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea") + + # load + print("Loading %s dataset..." 
% dataset) + nbatches = 0 + file, days = data_utils.loadDataset( + dataset, + max_ind_range, + sub_sample_rate, + randomize, + split, + raw_data, + processed_data, + memory_map, + ) + + if memory_map: + # WARNING: at this point the data has been reordered and shuffled across files + # e.g. day__reordered.npz, what remains is simply to read and feed + # the data from each file, going in the order of days file-by-file, to the + # model during training. + train_data = CriteoDatasetWMemoryMap( + dataset, + max_ind_range, + sub_sample_rate, + randomize, + "train", + raw_data, + processed_data, + ) + + test_data = CriteoDatasetWMemoryMap( + dataset, + max_ind_range, + sub_sample_rate, + randomize, + "test", + raw_data, + processed_data, + ) + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=mini_batch_size, + shuffle=False, + num_workers=0, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=test_mini_batch_size, + shuffle=False, + num_workers=0, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + return train_data, train_loader, test_data, test_loader + + else: + # load and preprocess data + with np.load(file) as data: + X_int = data["X_int"] + X_cat = data["X_cat"] + y = data["y"] + counts = data["counts"] + + # get a number of samples per day + total_file = d_path + d_file + "_day_count.npz" + with np.load(total_file) as data: + total_per_file = data["total_per_file"] + + # transform + ( + X_cat_train, + X_int_train, + y_train, + X_cat_val, + X_int_val, + y_val, + X_cat_test, + X_int_test, + y_test, + ) = data_utils.transformCriteoAdData( + X_cat, X_int, y, days, split, randomize, total_per_file + ) + ln_emb = counts + m_den = X_int_train.shape[1] + n_emb = len(counts) + print("Sparse features = %d, Dense features = %d" % (n_emb, m_den)) + + # adjust parameters + def assemble_samples(X_cat, X_int, y, max_ind_range, print_message): + if max_ind_range > 0: + X_cat = X_cat % max_ind_range + + nsamples = len(y) + data_size = nsamples + # using floor is equivalent to dropping last mini-batch (drop_last = True) + nbatches = int(np.floor((data_size * 1.0) / mini_batch_size)) + print(print_message) + if num_batches != 0 and num_batches < nbatches: + print( + "Limiting to %d batches of the total % d batches" + % (num_batches, nbatches) + ) + nbatches = num_batches + else: + print("Total number of batches %d" % nbatches) + + # data main loop + lX = [] + lS_lengths = [] + lS_indices = [] + lT = [] + for j in range(0, nbatches): + # number of data points in a batch + print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r") + n = min(mini_batch_size, data_size - (j * mini_batch_size)) + # dense feature + idx_start = j * mini_batch_size + lX.append((X_int[idx_start : (idx_start + n)]).astype(np.float32)) + # Targets - outputs + lT.append( + (y[idx_start : idx_start + n]).reshape(-1, 1).astype(np.int32) + ) + # sparse feature (sparse indices) + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for size in range(n_emb): + lS_batch_indices = [] + for _b in range(n): + # num of sparse indices to be used per embedding, e.g. 
for + # store lengths and indices + lS_batch_indices += ( + (X_cat[idx_start + _b][size].reshape(-1)).astype(np.int32) + ).tolist() + lS_emb_indices.append(lS_batch_indices) + lS_indices.append(lS_emb_indices) + # Criteo Kaggle data it is 1 because data is categorical + lS_lengths.append( + [(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)] + ) + print("\n") + + return nbatches, lX, lS_lengths, lS_indices, lT + + # adjust training data + (nbatches, lX, lS_lengths, lS_indices, lT) = assemble_samples( + X_cat_train, X_int_train, y_train, max_ind_range, "Training data" + ) + + # adjust testing data + (nbatches_t, lX_t, lS_lengths_t, lS_indices_t, lT_t) = assemble_samples( + X_cat_test, X_int_test, y_test, max_ind_range, "Testing data" + ) + # end if memory_map + + return ( + nbatches, + lX, + lS_lengths, + lS_indices, + lT, + nbatches_t, + lX_t, + lS_lengths_t, + lS_indices_t, + lT_t, + ln_emb, + m_den, + ) + + +def generate_random_data( + m_den, + ln_emb, + data_size, + num_batches, + mini_batch_size, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + num_targets=1, + round_targets=False, + data_generation="random", + trace_file="", + enable_padding=False, +): + nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size)) + if num_batches != 0: + nbatches = num_batches + data_size = nbatches * mini_batch_size + # print("Total number of batches %d" % nbatches) + + # inputs and targets + lT = [] + lX = [] + lS_lengths = [] + lS_indices = [] + for j in range(0, nbatches): + # number of data points in a batch + n = min(mini_batch_size, data_size - (j * mini_batch_size)) + + # generate a batch of dense and sparse features + if data_generation == "random": + (Xt, lS_emb_lengths, lS_emb_indices) = generate_uniform_input_batch( + m_den, ln_emb, n, num_indices_per_lookup, num_indices_per_lookup_fixed + ) + elif data_generation == "synthetic": + (Xt, lS_emb_lengths, lS_emb_indices) = generate_synthetic_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + trace_file, + enable_padding, + ) + else: + sys.exit( + "ERROR: --data-generation=" + data_generation + " is not supported" + ) + # dense feature + lX.append(Xt) + # sparse feature (sparse indices) + lS_lengths.append(lS_emb_lengths) + lS_indices.append(lS_emb_indices) + + # generate a batch of target (probability of a click) + P = generate_random_output_batch(n, num_targets, round_targets) + lT.append(P) + + return (nbatches, lX, lS_lengths, lS_indices, lT) + + +def generate_random_output_batch(n, num_targets=1, round_targets=False): + # target (probability of a click) + if round_targets: + P = np.round(ra.rand(n, num_targets).astype(np.float32)).astype(np.int32) + else: + P = ra.rand(n, num_targets).astype(np.float32) + + return P + + +# uniform ditribution (input data) +def generate_uniform_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, +): + # dense feature + Xt = ra.rand(n, m_den).astype(np.float32) + + # sparse feature (sparse indices) + lS_emb_lengths = [] + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for size in ln_emb: + lS_batch_lengths = [] + lS_batch_indices = [] + for _ in range(n): + # num of sparse indices to be used per embedding (between + if num_indices_per_lookup_fixed: + sparse_group_size = np.int32(num_indices_per_lookup) + else: + # random between [1,num_indices_per_lookup]) + r = ra.random(1) + sparse_group_size = np.int32( + 
max(1, np.round(r * min(size, num_indices_per_lookup))[0]) + ) + # sparse indices to be used per embedding + r = ra.random(sparse_group_size) + sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int32)) + # reset sparse_group_size in case some index duplicates were removed + sparse_group_size = np.int32(sparse_group.size) + # store lengths and indices + lS_batch_lengths += [sparse_group_size] + lS_batch_indices += sparse_group.tolist() + lS_emb_lengths.append(lS_batch_lengths) + lS_emb_indices.append(lS_batch_indices) + + return (Xt, lS_emb_lengths, lS_emb_indices) + + +# synthetic distribution (input data) +def generate_synthetic_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + trace_file, + enable_padding=False, +): + # dense feature + Xt = ra.rand(n, m_den).astype(np.float32) + + # sparse feature (sparse indices) + lS_emb_lengths = [] + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for i, size in enumerate(ln_emb): + lS_batch_lengths = [] + lS_batch_indices = [] + for _ in range(n): + # num of sparse indices to be used per embedding (between + if num_indices_per_lookup_fixed: + sparse_group_size = np.int32(num_indices_per_lookup) + else: + # random between [1,num_indices_per_lookup]) + r = ra.random(1) + sparse_group_size = np.int32( + max(1, np.round(r * min(size, num_indices_per_lookup))[0]) + ) + # sparse indices to be used per embedding + file_path = trace_file + line_accesses, list_sd, cumm_sd = read_dist_from_file( + file_path.replace("j", str(i)) + ) + # debug print + # print('input') + # print(line_accesses); print(list_sd); print(cumm_sd); + # print(sparse_group_size) + # approach 1: rand + # r = trace_generate_rand( + # line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding + # ) + # approach 2: lru + r = trace_generate_lru( + line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding + ) + # WARNING: if the distribution in the file is not consistent with + # embedding table dimensions, below mod guards against out of + # range access + sparse_group = np.unique(r).astype(np.int32) + minsg = np.min(sparse_group) + maxsg = np.max(sparse_group) + if (minsg < 0) or (size <= maxsg): + print( + "WARNING: distribution is inconsistent with embedding " + + "table size (using mod to recover and continue)" + ) + sparse_group = np.mod(sparse_group, size).astype(np.int32) + # sparse_group = np.unique(np.array(np.mod(r, size-1)).astype(np.int32)) + # reset sparse_group_size in case some index duplicates were removed + sparse_group_size = np.int32(sparse_group.size) + # store lengths and indices + lS_batch_lengths += [sparse_group_size] + lS_batch_indices += sparse_group.tolist() + lS_emb_lengths.append(lS_batch_lengths) + lS_emb_indices.append(lS_batch_indices) + + return (Xt, lS_emb_lengths, lS_emb_indices) + + +def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False): + u = ra.rand(1) + if i < max_i: + # only generate stack distances up to the number of new references seen so far + j = bisect.bisect(cumm_val, i) - 1 + fi = cumm_dist[j] + u *= fi # shrink distribution support to exclude last values + elif enable_padding: + # WARNING: disable generation of new references (once all have been seen) + fi = cumm_dist[0] + u = (1.0 - fi) * u + fi # remap distribution support to exclude first value + + for (j, f) in enumerate(cumm_dist): + if u <= f: + return cumm_val[j] + + +# WARNING: global define, must be 
consistent across all synthetic functions +cache_line_size = 1 + + +def trace_generate_lru( + line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False +): + max_sd = list_sd[-1] + l = len(line_accesses) + i = 0 + ztrace = [] + for _ in range(out_trace_len): + sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding) + mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0 + # generate memory reference + if sd == 0: # new reference # + line_ref = line_accesses.pop(0) + line_accesses.append(line_ref) + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + i += 1 + else: # existing reference # + line_ref = line_accesses[l - sd] + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + line_accesses.pop(l - sd) + line_accesses.append(line_ref) + # save generated memory reference + ztrace.append(mem_ref) + + return ztrace + + +def trace_generate_rand( + line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False +): + max_sd = list_sd[-1] + l = len(line_accesses) # !!!Unique, + i = 0 + ztrace = [] + for _ in range(out_trace_len): + sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding) + mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0 + # generate memory reference + if sd == 0: # new reference # + line_ref = line_accesses.pop(0) + line_accesses.append(line_ref) + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + i += 1 + else: # existing reference # + line_ref = line_accesses[l - sd] + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + ztrace.append(mem_ref) + + return ztrace + + +def trace_profile(trace, enable_padding=False): + # number of elements in the array (assuming 1D) + # n = trace.size + + rstack = [] # S + stack_distances = [] # SDS + line_accesses = [] # L + for x in trace: + r = np.uint64(x / cache_line_size) + l = len(rstack) + try: # found # + i = rstack.index(r) + # WARNING: I believe below is the correct depth in terms of meaning of the + # algorithm, but that is not what seems to be in the paper alg. + # -1 can be subtracted if we defined the distance between + # consecutive accesses (e.g. r, r) as 0 rather than 1. + sd = l - i # - 1 + # push r to the end of stack_distances + stack_distances.insert(0, sd) + # remove r from its position and insert to the top of stack + rstack.pop(i) # rstack.remove(r) + rstack.insert(l - 1, r) + except ValueError: # not found # + sd = 0 # -1 + # push r to the end of stack_distances/line_accesses + stack_distances.insert(0, sd) + line_accesses.insert(0, r) + # push r to the top of stack + rstack.insert(l, r) + + if enable_padding: + # WARNING: notice that as the ratio between the number of samples (l) + # and cardinality (c) of a sample increases the probability of + # generating a sample gets smaller and smaller because there are + # few new samples compared to repeated samples. This means that for a + # long trace with relatively small cardinality it will take longer to + # generate all new samples and therefore obtain full distribution support + # and hence it takes longer for distribution to resemble the original. + # Therefore, we may pad the number of new samples to be on par with + # average number of samples l/c artificially. 
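+        # illustrative example (hypothetical numbers): with l = 1000 recorded stack
+        # distances and a largest distance c = 50, padding = ceil(1000 / 50) = 20,
+        # i.e. 20 zero stack distances (artificial "new reference" samples) are appended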
+ l = len(stack_distances) + c = max(stack_distances) + padding = int(np.ceil(l / c)) + stack_distances = stack_distances + [0] * padding + + return (rstack, stack_distances, line_accesses) + + +# auxiliary read/write routines +def read_trace_from_file(file_path): + try: + with open(file_path) as f: + if args.trace_file_binary_type: + array = np.fromfile(f, dtype=np.uint64) + trace = array.astype(np.uint64).tolist() + else: + line = f.readline() + trace = list(map(lambda x: np.uint64(x), line.split(", "))) + return trace + except Exception: + print("ERROR: no input trace file has been provided") + + +def write_trace_to_file(file_path, trace): + try: + if args.trace_file_binary_type: + with open(file_path, "wb+") as f: + np.array(trace).astype(np.uint64).tofile(f) + else: + with open(file_path, "w+") as f: + s = str(trace) + f.write(s[1 : len(s) - 1]) + except Exception: + print("ERROR: no output trace file has been provided") + + +def read_dist_from_file(file_path): + try: + with open(file_path, "r") as f: + lines = f.read().splitlines() + except Exception: + print("Wrong file or file path") + # read unique accesses + unique_accesses = [int(el) for el in lines[0].split(", ")] + # read cumulative distribution (elements are passed as two separate lists) + list_sd = [int(el) for el in lines[1].split(", ")] + cumm_sd = [float(el) for el in lines[2].split(", ")] + + return unique_accesses, list_sd, cumm_sd + + +def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd): + try: + with open(file_path, "w") as f: + # unique_acesses + s = str(unique_accesses) + f.write(s[1 : len(s) - 1] + "\n") + # list_sd + s = str(list_sd) + f.write(s[1 : len(s) - 1] + "\n") + # cumm_sd + s = str(cumm_sd) + f.write(s[1 : len(s) - 1] + "\n") + except Exception: + print("Wrong file or file path") + + +if __name__ == "__main__": + import sys + import operator + import argparse + + ### parse arguments ### + parser = argparse.ArgumentParser(description="Generate Synthetic Distributions") + parser.add_argument("--trace-file", type=str, default="./input/trace.log") + parser.add_argument("--trace-file-binary-type", type=bool, default=False) + parser.add_argument("--trace-enable-padding", type=bool, default=False) + parser.add_argument("--dist-file", type=str, default="./input/dist.log") + parser.add_argument( + "--synthetic-file", type=str, default="./input/trace_synthetic.log" + ) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + parser.add_argument("--print-precision", type=int, default=5) + args = parser.parse_args() + + ### some basic setup ### + np.random.seed(args.numpy_rand_seed) + np.set_printoptions(precision=args.print_precision) + + ### read trace ### + trace = read_trace_from_file(args.trace_file) + # print(trace) + + ### profile trace ### + (_, stack_distances, line_accesses) = trace_profile( + trace, args.trace_enable_padding + ) + stack_distances.reverse() + line_accesses.reverse() + # print(line_accesses) + # print(stack_distances) + + ### compute probability distribution ### + # count items + l = len(stack_distances) + dc = sorted( + collections.Counter(stack_distances).items(), key=operator.itemgetter(0) + ) + + # create a distribution + list_sd = list(map(lambda tuple_x_k: tuple_x_k[0], dc)) # x = tuple_x_k[0] + dist_sd = list( + map(lambda tuple_x_k: tuple_x_k[1] / float(l), dc) + ) # k = tuple_x_k[1] + cumm_sd = [] # np.cumsum(dc).tolist() #prefixsum + for i, (_, k) in enumerate(dc): + if i == 0: + cumm_sd.append(k / float(l)) + else: + # add the 2nd element of the i-th 
tuple in the dist_sd list + cumm_sd.append(cumm_sd[i - 1] + (k / float(l))) + + ### write stack_distance and line_accesses to a file ### + write_dist_to_file(args.dist_file, line_accesses, list_sd, cumm_sd) + + ### generate correspondinf synthetic ### + # line_accesses, list_sd, cumm_sd = read_dist_from_file(args.dist_file) + synthetic_trace = trace_generate_lru( + line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding + ) + # synthetic_trace = trace_generate_rand( + # line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding + # ) + write_trace_to_file(args.synthetic_file, synthetic_trace) diff --git a/benchmarks/dlrm/ootb/dlrm_data_pytorch.py b/benchmarks/dlrm/ootb/dlrm_data_pytorch.py new file mode 100644 index 0000000..9c4fa89 --- /dev/null +++ b/benchmarks/dlrm/ootb/dlrm_data_pytorch.py @@ -0,0 +1,1309 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: generate inputs and targets for the dlrm benchmark +# The inpts and outputs are generated according to the following three option(s) +# 1) random distribution +# 2) synthetic distribution, based on unique accesses and distances between them +# i) R. Hassan, A. Harris, N. Topham and A. Efthymiou "Synthetic Trace-Driven +# Simulation of Cache Memory", IEEE AINAM'07 +# 3) public data set +# i) Criteo Kaggle Display Advertising Challenge Dataset +# https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset +# ii) Criteo Terabyte Dataset +# https://labs.criteo.com/2013/12/download-terabyte-click-logs + + +from __future__ import absolute_import, division, print_function, unicode_literals + +# others +from os import path +import sys +import functools +import bisect +import collections + +import data_utils + +# numpy +import numpy as np +from numpy import random as ra +from collections import deque + + +# pytorch +import torch +from torch.utils.data import Dataset, RandomSampler + +import data_loader_terabyte +import mlperf_logger + + +# Kaggle Display Advertising Challenge Dataset +# dataset (str): name of dataset (Kaggle or Terabyte) +# randomize (str): determines randomization scheme +# "none": no randomization +# "day": randomizes each day"s data (only works if split = True) +# "total": randomizes total dataset +# split (bool) : to split into train, test, validation data-sets +class CriteoDataset(Dataset): + + def __init__( + self, + dataset, + max_ind_range, + sub_sample_rate, + randomize, + split="train", + raw_path="", + pro_data="", + memory_map=False, + dataset_multiprocessing=False, + ): + # dataset + # tar_fea = 1 # single target + den_fea = 13 # 13 dense features + # spa_fea = 26 # 26 sparse features + # tad_fea = tar_fea + den_fea + # tot_fea = tad_fea + spa_fea + if dataset == "kaggle": + days = 7 + out_file = "kaggleAdDisplayChallenge_processed" + elif dataset == "terabyte": + days = 24 + out_file = "terabyte_processed" + else: + raise(ValueError("Data set option is not supported")) + self.max_ind_range = max_ind_range + self.memory_map = memory_map + + # split the datafile into path and filename + lstr = raw_path.split("/") + self.d_path = "/".join(lstr[0:-1]) + "/" + self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1] + self.npzfile = self.d_path + ( + (self.d_file + "_day") if dataset == "kaggle" else self.d_file + ) + self.trafile = self.d_path + ( + (self.d_file + "_fea") if dataset == "kaggle" else "fea" + 
) + + # check if pre-processed data is available + data_ready = True + if memory_map: + for i in range(days): + reo_data = self.npzfile + "_{0}_reordered.npz".format(i) + if not path.exists(str(reo_data)): + data_ready = False + else: + if not path.exists(str(pro_data)): + data_ready = False + + # pre-process data if needed + # WARNNING: when memory mapping is used we get a collection of files + if data_ready: + print("Reading pre-processed data=%s" % (str(pro_data))) + file = str(pro_data) + else: + print("Reading raw data=%s" % (str(raw_path))) + file = data_utils.getCriteoAdData( + raw_path, + out_file, + max_ind_range, + sub_sample_rate, + days, + split, + randomize, + dataset == "kaggle", + memory_map, + dataset_multiprocessing, + ) + + # get a number of samples per day + total_file = self.d_path + self.d_file + "_day_count.npz" + with np.load(total_file) as data: + total_per_file = data["total_per_file"] + # compute offsets per file + self.offset_per_file = np.array([0] + [x for x in total_per_file]) + for i in range(days): + self.offset_per_file[i + 1] += self.offset_per_file[i] + # print(self.offset_per_file) + + # setup data + if memory_map: + # setup the training/testing split + self.split = split + if split == 'none' or split == 'train': + self.day = 0 + self.max_day_range = days if split == 'none' else days - 1 + elif split == 'test' or split == 'val': + self.day = days - 1 + num_samples = self.offset_per_file[days] - \ + self.offset_per_file[days - 1] + self.test_size = int(np.ceil(num_samples / 2.)) + self.val_size = num_samples - self.test_size + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + + ''' + # text + print("text") + for i in range(days): + fi = self.npzfile + "_{0}".format(i) + with open(fi) as data: + ttt = 0; nnn = 0 + for _j, line in enumerate(data): + ttt +=1 + if np.int32(line[0]) > 0: + nnn +=1 + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") + # processed + print("processed") + for i in range(days): + fi = self.npzfile + "_{0}_processed.npz".format(i) + with np.load(fi) as data: + yyy = data["y"] + ttt = len(yyy) + nnn = np.count_nonzero(yyy) + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") + # reordered + print("reordered") + for i in range(days): + fi = self.npzfile + "_{0}_reordered.npz".format(i) + with np.load(fi) as data: + yyy = data["y"] + ttt = len(yyy) + nnn = np.count_nonzero(yyy) + print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" + + str(nnn) + " ratio=" +str((nnn * 100.) 
/ ttt) + "%") + ''' + + # load unique counts + with np.load(self.d_path + self.d_file + "_fea_count.npz") as data: + self.counts = data["counts"] + self.m_den = den_fea # X_int.shape[1] + self.n_emb = len(self.counts) + print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den)) + + # Load the test data + # Only a single day is used for testing + if self.split == 'test' or self.split == 'val': + # only a single day is used for testing + fi = self.npzfile + "_{0}_reordered.npz".format( + self.day + ) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + + else: + # load and preprocess data + with np.load(file) as data: + X_int = data["X_int"] # continuous feature + X_cat = data["X_cat"] # categorical feature + y = data["y"] # target + self.counts = data["counts"] + self.m_den = X_int.shape[1] # den_fea + self.n_emb = len(self.counts) + print("Sparse fea = %d, Dense fea = %d" % (self.n_emb, self.m_den)) + + # create reordering + indices = np.arange(len(y)) + + if split == "none": + # randomize all data + if randomize == "total": + indices = np.random.permutation(indices) + print("Randomized indices...") + + X_int[indices] = X_int + X_cat[indices] = X_cat + y[indices] = y + + else: + indices = np.array_split(indices, self.offset_per_file[1:-1]) + + # randomize train data (per day) + if randomize == "day": # or randomize == "total": + for i in range(len(indices) - 1): + indices[i] = np.random.permutation(indices[i]) + print("Randomized indices per day ...") + + train_indices = np.concatenate(indices[:-1]) + test_indices = indices[-1] + test_indices, val_indices = np.array_split(test_indices, 2) + + print("Defined %s indices..." % (split)) + + # randomize train data (across days) + if randomize == "total": + train_indices = np.random.permutation(train_indices) + print("Randomized indices across days ...") + + # create training, validation, and test sets + if split == 'train': + self.X_int = [X_int[i] for i in train_indices] + self.X_cat = [X_cat[i] for i in train_indices] + self.y = [y[i] for i in train_indices] + elif split == 'val': + self.X_int = [X_int[i] for i in val_indices] + self.X_cat = [X_cat[i] for i in val_indices] + self.y = [y[i] for i in val_indices] + elif split == 'test': + self.X_int = [X_int[i] for i in test_indices] + self.X_cat = [X_cat[i] for i in test_indices] + self.y = [y[i] for i in test_indices] + + print("Split data according to indices...") + + def __getitem__(self, index): + + if isinstance(index, slice): + return [ + self[idx] for idx in range( + index.start or 0, index.stop or len(self), index.step or 1 + ) + ] + + if self.memory_map: + if self.split == 'none' or self.split == 'train': + # check if need to swicth to next day and load data + if index == self.offset_per_file[self.day]: + # print("day_boundary switch", index) + self.day_boundary = self.offset_per_file[self.day] + fi = self.npzfile + "_{0}_reordered.npz".format( + self.day + ) + # print('Loading file: ', fi) + with np.load(fi) as data: + self.X_int = data["X_int"] # continuous feature + self.X_cat = data["X_cat"] # categorical feature + self.y = data["y"] # target + self.day = (self.day + 1) % self.max_day_range + + i = index - self.day_boundary + elif self.split == 'test' or self.split == 'val': + # only a single day is used for testing + i = index + (0 if self.split == 'test' else self.test_size) + else: + sys.exit("ERROR: dataset split is neither none, nor train or test.") + 
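+        # without memory mapping, __init__ has already loaded this split's samples
+        # into self.X_int / self.X_cat / self.y, so the requested index is used directly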
else: + i = index + + if self.max_ind_range > 0: + return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i] + else: + return self.X_int[i], self.X_cat[i], self.y[i] + + def _default_preprocess(self, X_int, X_cat, y): + X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1) + if self.max_ind_range > 0: + X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long) + else: + X_cat = torch.tensor(X_cat, dtype=torch.long) + y = torch.tensor(y.astype(np.float32)) + + return X_int, X_cat, y + + def __len__(self): + if self.memory_map: + if self.split == 'none': + return self.offset_per_file[-1] + elif self.split == 'train': + return self.offset_per_file[-2] + elif self.split == 'test': + return self.test_size + elif self.split == 'val': + return self.val_size + else: + sys.exit("ERROR: dataset split is neither none, nor train nor test.") + else: + return len(self.y) + + +def collate_wrapper_criteo_offset(list_of_tuples): + # where each tuple is (X_int, X_cat, y) + transposed_data = list(zip(*list_of_tuples)) + X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) + X_cat = torch.tensor(transposed_data[1], dtype=torch.long) + T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) + + batchSize = X_cat.shape[0] + featureCnt = X_cat.shape[1] + + lS_i = [X_cat[:, i] for i in range(featureCnt)] + lS_o = [torch.tensor(range(batchSize)) for _ in range(featureCnt)] + + return X_int, torch.stack(lS_o), torch.stack(lS_i), T + + +def ensure_dataset_preprocessed(args, d_path): + _ = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + _ = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + for split in ['train', 'val', 'test']: + print('Running preprocessing for split =', split) + + train_files = ['{}_{}_reordered.npz'.format(args.raw_data_file, day) + for + day in range(0, 23)] + + test_valid_file = args.raw_data_file + '_23_reordered.npz' + + output_file = d_path + '_{}.bin'.format(split) + + input_files = train_files if split == 'train' else [test_valid_file] + data_loader_terabyte.numpy_to_binary(input_files=input_files, + output_file_path=output_file, + split=split) + + +# Conversion from offset to length +def offset_to_length_converter(lS_o, lS_i): + def diff(tensor): + return tensor[1:] - tensor[:-1] + + return torch.stack( + [ + diff(torch.cat((S_o, torch.tensor(lS_i[ind].shape))).int()) + for ind, S_o in enumerate(lS_o) + ] + ) + + +def collate_wrapper_criteo_length(list_of_tuples): + # where each tuple is (X_int, X_cat, y) + transposed_data = list(zip(*list_of_tuples)) + X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) + X_cat = torch.tensor(transposed_data[1], dtype=torch.long) + T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) + + batchSize = X_cat.shape[0] + featureCnt = X_cat.shape[1] + + lS_i = torch.stack([X_cat[:, i] for i in range(featureCnt)]) + lS_o = torch.stack( + [torch.tensor(range(batchSize)) for _ in range(featureCnt)] + ) + + lS_l = offset_to_length_converter(lS_o, lS_i) + + return X_int, lS_l, lS_i, T + + +def make_criteo_data_and_loaders(args, offset_to_length_converter=False): + if args.mlperf_logging and args.memory_map and 
args.data_set == "terabyte": + # more efficient for larger batches + data_directory = path.dirname(args.raw_data_file) + + if args.mlperf_bin_loader: + lstr = args.processed_data_file.split("/") + d_path = "/".join(lstr[0:-1]) + "/" + lstr[-1].split(".")[0] + train_file = d_path + "_train.bin" + test_file = d_path + "_test.bin" + # val_file = d_path + "_val.bin" + counts_file = args.raw_data_file + '_fea_count.npz' + + if any(not path.exists(p) for p in [train_file, + test_file, + counts_file]): + ensure_dataset_preprocessed(args, d_path) + + train_data = data_loader_terabyte.CriteoBinDataset( + data_file=train_file, + counts_file=counts_file, + batch_size=args.mini_batch_size, + max_ind_range=args.max_ind_range + ) + + mlperf_logger.log_event(key=mlperf_logger.constants.TRAIN_SAMPLES, + value=train_data.num_samples) + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=None, + batch_sampler=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + sampler=RandomSampler(train_data) if args.mlperf_bin_shuffle else None + ) + + test_data = data_loader_terabyte.CriteoBinDataset( + data_file=test_file, + counts_file=counts_file, + batch_size=args.test_mini_batch_size, + max_ind_range=args.max_ind_range + ) + + mlperf_logger.log_event(key=mlperf_logger.constants.EVAL_SAMPLES, + value=test_data.num_samples) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=None, + batch_sampler=None, + shuffle=False, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + ) + else: + data_filename = args.raw_data_file.split("/")[-1] + + train_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + test_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing + ) + + train_loader = data_loader_terabyte.DataLoader( + data_directory=data_directory, + data_filename=data_filename, + days=list(range(23)), + batch_size=args.mini_batch_size, + max_ind_range=args.max_ind_range, + split="train" + ) + + test_loader = data_loader_terabyte.DataLoader( + data_directory=data_directory, + data_filename=data_filename, + days=[23], + batch_size=args.test_mini_batch_size, + max_ind_range=args.max_ind_range, + split="test" + ) + else: + train_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "train", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing, + ) + + test_data = CriteoDataset( + args.data_set, + args.max_ind_range, + args.data_sub_sample_rate, + args.data_randomize, + "test", + args.raw_data_file, + args.processed_data_file, + args.memory_map, + args.dataset_multiprocessing, + ) + + collate_wrapper_criteo = collate_wrapper_criteo_offset + if offset_to_length_converter: + collate_wrapper_criteo = collate_wrapper_criteo_length + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=args.mini_batch_size, + shuffle=False, + num_workers=args.num_workers, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=args.test_mini_batch_size, + shuffle=False, + 
num_workers=args.test_num_workers, + collate_fn=collate_wrapper_criteo, + pin_memory=False, + drop_last=False, # True + ) + + return train_data, train_loader, test_data, test_loader + + +# uniform ditribution (input data) +class RandomDataset(Dataset): + + def __init__( + self, + m_den, + ln_emb, + data_size, + num_batches, + mini_batch_size, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + num_targets=1, + round_targets=False, + data_generation="random", + trace_file="", + enable_padding=False, + reset_seed_on_access=False, + rand_data_dist="uniform", + rand_data_min=1, + rand_data_max=1, + rand_data_mu=-1, + rand_data_sigma=1, + rand_seed=0, + cache_size=None, + ): + # compute batch size + nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size)) + if num_batches != 0: + nbatches = num_batches + data_size = nbatches * mini_batch_size + # print("Total number of batches %d" % nbatches) + + # save args (recompute data_size if needed) + self.m_den = m_den + self.ln_emb = ln_emb + self.data_size = data_size + self.num_batches = nbatches + self.mini_batch_size = mini_batch_size + self.num_indices_per_lookup = num_indices_per_lookup + self.num_indices_per_lookup_fixed = num_indices_per_lookup_fixed + self.num_targets = num_targets + self.round_targets = round_targets + self.data_generation = data_generation + self.trace_file = trace_file + self.enable_padding = enable_padding + self.reset_seed_on_access = reset_seed_on_access + self.rand_seed = rand_seed + self.rand_data_dist = rand_data_dist + self.rand_data_min = rand_data_min + self.rand_data_max = rand_data_max + self.rand_data_mu = rand_data_mu + self.rand_data_sigma = rand_data_sigma + self.cache_size = cache_size + + def reset_numpy_seed(self, numpy_rand_seed): + np.random.seed(numpy_rand_seed) + # torch.manual_seed(numpy_rand_seed) + + def __getitem__(self, index): + + if isinstance(index, slice): + return [ + self[idx] for idx in range( + index.start or 0, index.stop or len(self), index.step or 1 + ) + ] + + # WARNING: reset seed on access to first element + # (e.g. 
if same random samples needed across epochs) + if self.reset_seed_on_access and index == 0: + self.reset_numpy_seed(self.rand_seed) + + # number of data points in a batch + n = min(self.mini_batch_size, self.data_size - (index * self.mini_batch_size)) + + # generate a batch of dense and sparse features + if self.data_generation == "random": + if self.cache_size is None: + Gen = generate_dist_input_batch.__wrapped__ + cache_key = None + else: + Gen = generate_dist_input_batch + cache_key = index % self.cache_size + (X, lS_o, lS_i) = Gen( + self.m_den, + tuple(self.ln_emb.tolist()), + n, + self.num_indices_per_lookup, + self.num_indices_per_lookup_fixed, + rand_data_dist=self.rand_data_dist, + rand_data_min=self.rand_data_min, + rand_data_max=self.rand_data_max, + rand_data_mu=self.rand_data_mu, + rand_data_sigma=self.rand_data_sigma, + cache_key=cache_key, + ) + elif self.data_generation == "synthetic": + (X, lS_o, lS_i) = generate_synthetic_input_batch( + self.m_den, + self.ln_emb, + n, + self.num_indices_per_lookup, + self.num_indices_per_lookup_fixed, + self.trace_file, + self.enable_padding + ) + else: + sys.exit( + "ERROR: --data-generation=" + self.data_generation + " is not supported" + ) + + # generate a batch of target (probability of a click) + if 'cache_key' in locals() and cache_key is not None: + T = generate_random_output_batch(n, self.num_targets, self.round_targets, cache_key) + else: + T = generate_random_output_batch.__wrapped__(n, self.num_targets, self.round_targets) + + return (X, lS_o, lS_i, T) + + def __len__(self): + # WARNING: note that we produce bacthes of outputs in __getitem__ + # therefore we should use num_batches rather than data_size below + return self.num_batches + + +def collate_wrapper_random_offset(list_of_tuples): + # where each tuple is (X, lS_o, lS_i, T) + (X, lS_o, lS_i, T) = list_of_tuples[0] + return (X, + torch.stack(lS_o), + lS_i, + T) + + +def collate_wrapper_random_length(list_of_tuples): + # where each tuple is (X, lS_o, lS_i, T) + (X, lS_o, lS_i, T) = list_of_tuples[0] + return (X, + offset_to_length_converter(torch.stack(lS_o), lS_i), + lS_i, + T) + + +def make_random_data_and_loader(args, ln_emb, m_den, + offset_to_length_converter=False, cache_size=None, +): + + train_data = RandomDataset( + m_den, + ln_emb, + args.data_size, + args.num_batches, + args.mini_batch_size, + args.num_indices_per_lookup, + args.num_indices_per_lookup_fixed, + 1, # num_targets + args.round_targets, + args.data_generation, + args.data_trace_file, + args.data_trace_enable_padding, + reset_seed_on_access=True, + rand_data_dist=args.rand_data_dist, + rand_data_min=args.rand_data_min, + rand_data_max=args.rand_data_max, + rand_data_mu=args.rand_data_mu, + rand_data_sigma=args.rand_data_sigma, + rand_seed=args.numpy_rand_seed, + cache_size=cache_size, + ) # WARNING: generates a batch of lookups at once + + test_data = RandomDataset( + m_den, + ln_emb, + args.data_size, + args.num_batches, + args.mini_batch_size, + args.num_indices_per_lookup, + args.num_indices_per_lookup_fixed, + 1, # num_targets + args.round_targets, + args.data_generation, + args.data_trace_file, + args.data_trace_enable_padding, + reset_seed_on_access=True, + rand_data_dist=args.rand_data_dist, + rand_data_min=args.rand_data_min, + rand_data_max=args.rand_data_max, + rand_data_mu=args.rand_data_mu, + rand_data_sigma=args.rand_data_sigma, + rand_seed=args.numpy_rand_seed, + cache_size=cache_size, + ) + + collate_wrapper_random = collate_wrapper_random_offset + if offset_to_length_converter: + 
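+        # the length-based wrapper turns each table's cumulative offsets into
+        # per-lookup lengths (differences of consecutive offsets) via
+        # offset_to_length_converter before returning the batch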
collate_wrapper_random = collate_wrapper_random_length + + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=1, + shuffle=False, + num_workers=args.num_workers, + collate_fn=collate_wrapper_random, + pin_memory=False, + drop_last=False, # True + ) + + test_loader = torch.utils.data.DataLoader( + test_data, + batch_size=1, + shuffle=False, + num_workers=args.num_workers, + collate_fn=collate_wrapper_random, + pin_memory=False, + drop_last=False, # True + ) + return train_data, train_loader, test_data, test_loader + + +def generate_random_data( + m_den, + ln_emb, + data_size, + num_batches, + mini_batch_size, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + num_targets=1, + round_targets=False, + data_generation="random", + trace_file="", + enable_padding=False, + length=False, # length for caffe2 version (except dlrm_s_caffe2) +): + nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size)) + if num_batches != 0: + nbatches = num_batches + data_size = nbatches * mini_batch_size + # print("Total number of batches %d" % nbatches) + + # inputs + lT = [] + lX = [] + lS_offsets = [] + lS_indices = [] + for j in range(0, nbatches): + # number of data points in a batch + n = min(mini_batch_size, data_size - (j * mini_batch_size)) + + # generate a batch of dense and sparse features + if data_generation == "random": + (Xt, lS_emb_offsets, lS_emb_indices) = generate_uniform_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + length, + ) + elif data_generation == "synthetic": + (Xt, lS_emb_offsets, lS_emb_indices) = generate_synthetic_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + trace_file, + enable_padding + ) + else: + sys.exit( + "ERROR: --data-generation=" + data_generation + " is not supported" + ) + # dense feature + lX.append(Xt) + # sparse feature (sparse indices) + lS_offsets.append(lS_emb_offsets) + lS_indices.append(lS_emb_indices) + + # generate a batch of target (probability of a click) + P = generate_random_output_batch(n, num_targets, round_targets) + lT.append(P) + + return (nbatches, lX, lS_offsets, lS_indices, lT) + + +@functools.lru_cache(maxsize=None) +def generate_random_output_batch(n, num_targets, round_targets=False, cache_key=None): + # target (probability of a click) + if round_targets: + P = np.round(ra.rand(n, num_targets).astype(np.float32)).astype(np.float32) + else: + P = ra.rand(n, num_targets).astype(np.float32) + + return torch.tensor(P) + + +# uniform ditribution (input data) +def generate_uniform_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + length, +): + # dense feature + Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32)) + + # sparse feature (sparse indices) + lS_emb_offsets = [] + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for size in ln_emb: + lS_batch_offsets = [] + lS_batch_indices = [] + offset = 0 + for _ in range(n): + # num of sparse indices to be used per embedding (between + if num_indices_per_lookup_fixed: + sparse_group_size = np.int64(num_indices_per_lookup) + else: + # random between [1,num_indices_per_lookup]) + r = ra.random(1) + sparse_group_size = np.int64( + np.round(max([1.0], r * min(size, num_indices_per_lookup))) + ) + # sparse indices to be used per embedding + r = ra.random(sparse_group_size) + sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64)) + # 
reset sparse_group_size in case some index duplicates were removed + sparse_group_size = np.int32(sparse_group.size) + # store lengths and indices + if length: # for caffe2 version + lS_batch_offsets += [sparse_group_size] + else: + lS_batch_offsets += [offset] + lS_batch_indices += sparse_group.tolist() + # update offset for next iteration + offset += sparse_group_size + lS_emb_offsets.append(torch.tensor(lS_batch_offsets)) + lS_emb_indices.append(torch.tensor(lS_batch_indices)) + + return (Xt, lS_emb_offsets, lS_emb_indices) + + +# random data from uniform or gaussian ditribution (input data) +@functools.lru_cache(maxsize=None) +def generate_dist_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + rand_data_dist, + rand_data_min, + rand_data_max, + rand_data_mu, + rand_data_sigma, + cache_key = None, +): + # dense feature + Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32)) + + # sparse feature (sparse indices) + lS_emb_offsets = [] + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for size in ln_emb: + lS_batch_offsets = [] + lS_batch_indices = [] + offset = 0 + for _ in range(n): + # num of sparse indices to be used per embedding (between + if num_indices_per_lookup_fixed: + sparse_group_size = np.int64(num_indices_per_lookup) + else: + # random between [1,num_indices_per_lookup]) + r = ra.random(1) + sparse_group_size = np.int64( + np.round(max([1.0], r * min(size, num_indices_per_lookup))) + ) + # sparse indices to be used per embedding + if rand_data_dist == "gaussian": + if rand_data_mu == -1: + rand_data_mu = (rand_data_max + rand_data_min) / 2.0 + r = ra.normal(rand_data_mu, rand_data_sigma, sparse_group_size) + sparse_group = np.clip(r, rand_data_min, rand_data_max) + sparse_group = np.unique(sparse_group).astype(np.int64) + elif rand_data_dist == "uniform": + r = ra.random(sparse_group_size) + sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64)) + else: + raise(rand_data_dist, "distribution is not supported. 
\ + please select uniform or gaussian") + + # reset sparse_group_size in case some index duplicates were removed + sparse_group_size = np.int64(sparse_group.size) + # store lengths and indices + lS_batch_offsets += [offset] + lS_batch_indices += sparse_group.tolist() + # update offset for next iteration + offset += sparse_group_size + lS_emb_offsets.append(torch.tensor(lS_batch_offsets)) + lS_emb_indices.append(torch.tensor(lS_batch_indices)) + + return (Xt, lS_emb_offsets, lS_emb_indices) + + +# synthetic distribution (input data) +def generate_synthetic_input_batch( + m_den, + ln_emb, + n, + num_indices_per_lookup, + num_indices_per_lookup_fixed, + trace_file, + enable_padding=False, +): + # dense feature + Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32)) + + # sparse feature (sparse indices) + lS_emb_offsets = [] + lS_emb_indices = [] + # for each embedding generate a list of n lookups, + # where each lookup is composed of multiple sparse indices + for i, size in enumerate(ln_emb): + lS_batch_offsets = [] + lS_batch_indices = [] + offset = 0 + for _ in range(n): + # num of sparse indices to be used per embedding (between + if num_indices_per_lookup_fixed: + sparse_group_size = np.int64(num_indices_per_lookup) + else: + # random between [1,num_indices_per_lookup]) + r = ra.random(1) + sparse_group_size = np.int64( + max(1, np.round(r * min(size, num_indices_per_lookup))[0]) + ) + # sparse indices to be used per embedding + file_path = trace_file + line_accesses, list_sd, cumm_sd = read_dist_from_file( + file_path.replace("j", str(i)) + ) + # debug prints + # print("input") + # print(line_accesses); print(list_sd); print(cumm_sd); + # print(sparse_group_size) + # approach 1: rand + # r = trace_generate_rand( + # line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding + # ) + # approach 2: lru + r = trace_generate_lru( + line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding + ) + # WARNING: if the distribution in the file is not consistent + # with embedding table dimensions, below mod guards against out + # of range access + sparse_group = np.unique(r).astype(np.int64) + minsg = np.min(sparse_group) + maxsg = np.max(sparse_group) + if (minsg < 0) or (size <= maxsg): + print( + "WARNING: distribution is inconsistent with embedding " + + "table size (using mod to recover and continue)" + ) + sparse_group = np.mod(sparse_group, size).astype(np.int64) + # sparse_group = np.unique(np.array(np.mod(r, size-1)).astype(np.int64)) + # reset sparse_group_size in case some index duplicates were removed + sparse_group_size = np.int64(sparse_group.size) + # store lengths and indices + lS_batch_offsets += [offset] + lS_batch_indices += sparse_group.tolist() + # update offset for next iteration + offset += sparse_group_size + lS_emb_offsets.append(torch.tensor(lS_batch_offsets)) + lS_emb_indices.append(torch.tensor(lS_batch_indices)) + + return (Xt, lS_emb_offsets, lS_emb_indices) + + +def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False): + u = ra.rand(1) + if i < max_i: + # only generate stack distances up to the number of new references seen so far + j = bisect.bisect(cumm_val, i) - 1 + fi = cumm_dist[j] + u *= fi # shrink distribution support to exclude last values + elif enable_padding: + # WARNING: disable generation of new references (once all have been seen) + fi = cumm_dist[0] + u = (1.0 - fi) * u + fi # remap distribution support to exclude first value + + for (j, f) in enumerate(cumm_dist): + if u <= f: + return cumm_val[j] + + +# 
WARNING: global define, must be consistent across all synthetic functions +cache_line_size = 1 + + +def trace_generate_lru( + line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False +): + max_sd = list_sd[-1] + l = len(line_accesses) + i = 0 + ztrace = deque() + for _ in range(out_trace_len): + sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding) + mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0 + + # generate memory reference + if sd == 0: # new reference # + line_ref = line_accesses[0] + del line_accesses[0] + line_accesses.append(line_ref) + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + i += 1 + else: # existing reference # + line_ref = line_accesses[l - sd] + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + del line_accesses[l - sd] + line_accesses.append(line_ref) + # save generated memory reference + ztrace.append(mem_ref) + + return ztrace + + +def trace_generate_rand( + line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False +): + max_sd = list_sd[-1] + l = len(line_accesses) # !!!Unique, + i = 0 + ztrace = [] + for _ in range(out_trace_len): + sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding) + mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0 + # generate memory reference + if sd == 0: # new reference # + line_ref = line_accesses.pop(0) + line_accesses.append(line_ref) + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + i += 1 + else: # existing reference # + line_ref = line_accesses[l - sd] + mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) + ztrace.append(mem_ref) + + return ztrace + + +def trace_profile(trace, enable_padding=False): + # number of elements in the array (assuming 1D) + # n = trace.size + + rstack = deque() # S + stack_distances = deque() # SDS + line_accesses = deque() # L + for x in trace: + r = np.uint64(x / cache_line_size) + l = len(rstack) + try: # found # + i = rstack.index(r) + # WARNING: I believe below is the correct depth in terms of meaning of the + # algorithm, but that is not what seems to be in the paper alg. + # -1 can be subtracted if we defined the distance between + # consecutive accesses (e.g. r, r) as 0 rather than 1. + sd = l - i # - 1 + # push r to the end of stack_distances + stack_distances.appendleft(sd) + # remove r from its position and insert to the top of stack + del rstack[i] # rstack.remove(r) + rstack.append(r) + except ValueError: # not found # + sd = 0 # -1 + # push r to the end of stack_distances/line_accesses + stack_distances.appendleft(sd) + line_accesses.appendleft(r) + # push r to the top of stack + rstack.append(r) + + if enable_padding: + # WARNING: notice that as the ratio between the number of samples (l) + # and cardinality (c) of a sample increases the probability of + # generating a sample gets smaller and smaller because there are + # few new samples compared to repeated samples. This means that for a + # long trace with relatively small cardinality it will take longer to + # generate all new samples and therefore obtain full distribution support + # and hence it takes longer for distribution to resemble the original. + # Therefore, we may pad the number of new samples to be on par with + # average number of samples l/c artificially. 
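+        # For example: if l = 1000 stack distances were profiled and the
+        # largest distance is c = 100 (roughly the number of distinct lines),
+        # ceil(l / c) = 10 additional zero-valued samples ("new references")
+        # are appended below.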
+        l = len(stack_distances)
+        c = max(stack_distances)
+        padding = int(np.ceil(l / c))
+        # extend() pads the deque in place (deque + list concatenation is not defined)
+        stack_distances.extend([0] * padding)
+
+    return (rstack, stack_distances, line_accesses)
+
+
+# auxiliary read/write routines
+def read_trace_from_file(file_path):
+    try:
+        with open(file_path) as f:
+            if args.trace_file_binary_type:
+                array = np.fromfile(f, dtype=np.uint64)
+                trace = array.astype(np.uint64).tolist()
+            else:
+                line = f.readline()
+                trace = list(map(lambda x: np.uint64(x), line.split(", ")))
+            return trace
+    except Exception:
+        print(f"ERROR: trace file '{file_path}' is not available.")
+
+
+def write_trace_to_file(file_path, trace):
+    try:
+        if args.trace_file_binary_type:
+            with open(file_path, "wb+") as f:
+                np.array(trace).astype(np.uint64).tofile(f)
+        else:
+            with open(file_path, "w+") as f:
+                s = str(list(trace))
+                f.write(s[1 : len(s) - 1])
+    except Exception:
+        print("ERROR: no output trace file has been provided")
+
+
+def read_dist_from_file(file_path):
+    try:
+        with open(file_path, "r") as f:
+            lines = f.read().splitlines()
+    except Exception:
+        print(f"ERROR: {file_path} wrong file or file path")
+        raise  # 'lines' would be undefined below, so re-raise instead of continuing
+    # read unique accesses
+    unique_accesses = [int(el) for el in lines[0].split(", ")]
+    # read cumulative distribution (elements are passed as two separate lists)
+    list_sd = [int(el) for el in lines[1].split(", ")]
+    cumm_sd = [float(el) for el in lines[2].split(", ")]
+
+    return unique_accesses, list_sd, cumm_sd
+
+
+def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd):
+    try:
+        with open(file_path, "w") as f:
+            # unique_accesses
+            s = str(list(unique_accesses))
+            f.write(s[1 : len(s) - 1] + "\n")
+            # list_sd
+            s = str(list_sd)
+            f.write(s[1 : len(s) - 1] + "\n")
+            # cumm_sd
+            s = str(list(cumm_sd))
+            f.write(s[1 : len(s) - 1] + "\n")
+    except Exception:
+        print("Wrong file or file path")
+
+
+if __name__ == "__main__":
+    import operator
+    import argparse
+
+    ### parse arguments ###
+    parser = argparse.ArgumentParser(description="Generate Synthetic Distributions")
+    parser.add_argument("--trace-file", type=str, default="./input/trace.log")
+    parser.add_argument("--trace-file-binary-type", type=bool, default=False)
+    parser.add_argument("--trace-enable-padding", type=bool, default=False)
+    parser.add_argument("--dist-file", type=str, default="./input/dist.log")
+    parser.add_argument(
+        "--synthetic-file", type=str, default="./input/trace_synthetic.log"
+    )
+    parser.add_argument("--numpy-rand-seed", type=int, default=123)
+    parser.add_argument("--print-precision", type=int, default=5)
+    args = parser.parse_args()
+
+    ### some basic setup ###
+    np.random.seed(args.numpy_rand_seed)
+    np.set_printoptions(precision=args.print_precision)
+
+    ### read trace ###
+    trace = read_trace_from_file(args.trace_file)
+    # print(trace)
+
+    ### profile trace ###
+    (_, stack_distances, line_accesses) = trace_profile(
+        trace, args.trace_enable_padding
+    )
+    stack_distances.reverse()
+    line_accesses.reverse()
+    # print(line_accesses)
+    # print(stack_distances)
+
+    ### compute probability distribution ###
+    # count items
+    l = len(stack_distances)
+    dc = sorted(
+        collections.Counter(stack_distances).items(), key=operator.itemgetter(0)
+    )
+
+    # create a distribution
+    list_sd = list(map(lambda tuple_x_k: tuple_x_k[0], dc))  # x = tuple_x_k[0]
+    dist_sd = list(
+        map(lambda tuple_x_k: tuple_x_k[1] / float(l), dc)
+    )  # k = tuple_x_k[1]
+    cumm_sd = deque()  # np.cumsum(dc).tolist() #prefixsum
+    for i, (_, k) in enumerate(dc):
+        if i == 0:
+            cumm_sd.append(k / float(l))
+        else:
+            # add 
the 2nd element of the i-th tuple in the dist_sd list + cumm_sd.append(cumm_sd[i - 1] + (k / float(l))) + + ### write stack_distance and line_accesses to a file ### + write_dist_to_file(args.dist_file, line_accesses, list_sd, cumm_sd) + + ### generate corresponding synthetic ### + # line_accesses, list_sd, cumm_sd = read_dist_from_file(args.dist_file) + synthetic_trace = trace_generate_lru( + line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding + ) + # synthetic_trace = trace_generate_rand( + # line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding + # ) + write_trace_to_file(args.synthetic_file, synthetic_trace) diff --git a/benchmarks/dlrm/ootb/dlrm_s_caffe2.py b/benchmarks/dlrm/ootb/dlrm_s_caffe2.py new file mode 100644 index 0000000..8e3ed74 --- /dev/null +++ b/benchmarks/dlrm/ootb/dlrm_s_caffe2.py @@ -0,0 +1,1703 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: an implementation of a deep learning recommendation model (DLRM) +# The model input consists of dense and sparse features. The former is a vector +# of floating point values. The latter is a list of sparse indices into +# embedding tables, which consist of vectors of floating point values. +# The selected vectors are passed to mlp networks denoted by triangles, +# in some cases the vectors are interacted through operators (Ops). +# +# output: +# vector of values +# model: | +# /\ +# /__\ +# | +# _____________________> Op <___________________ +# / | \ +# /\ /\ /\ +# /__\ /__\ ... /__\ +# | | | +# | Op Op +# | ____/__\_____ ____/__\____ +# | |_Emb_|____|__| ... |_Emb_|__|___| +# input: +# [ dense features ] [sparse indices] , ..., [sparse indices] +# +# More precise definition of model layers: +# 1) fully connected layers of an mlp +# z = f(y) +# y = Wx + b +# +# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) +# z = Op(e1,...,ek) +# obtain vectors e1=E[:,p1], ..., ek=E[:,pk] +# +# 3) Operator Op can be one of the following +# Sum(e1,...,ek) = e1 + ... + ek +# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] +# Cat(e1,...,ek) = [e1', ..., ek']' +# where ' denotes transpose operation +# +# References: +# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, +# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, +# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, +# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, +# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, +# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and +# Recommendation Systems", CoRR, arXiv:1906.00091, 2019 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import functools + +# others +import operator +import time +import copy + +# data generation +import dlrm_data_pytorch as dp + +# numpy +import numpy as np +import sklearn.metrics + +# onnx +# The onnx import causes deprecation warnings every time workers +# are spawned during testing. So, we filter out those warnings. 
+import warnings +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + try: + import onnx + import caffe2.python.onnx.frontend + except ImportError as error: + print('Unable to import onnx or caffe2.python.onnx.frontend ', error) + +# from caffe2.python import data_parallel_model + +# caffe2 +from caffe2.proto import caffe2_pb2 +from caffe2.python import brew, core, dyndep, model_helper, net_drawer, workspace + +""" +# auxiliary routine used to split input on the mini-bacth dimension +def where_to_split(mini_batch_size, ndevices, _add_leftover=False): + n = (mini_batch_size + ndevices - 1) // ndevices # ceiling + l = mini_batch_size - n * (ndevices - 1) # leftover + s = [n] * (ndevices - 1) + if _add_leftover: + ls += [l if l > 0 else n] + return ls +""" + + +### define dlrm in Caffe2 ### +class DLRM_Net(object): + def FeedBlobWrapper(self, tag, val, add_prefix=True, split=False, device_id=-1): + if self.ndevices > 1 and add_prefix: + if split: + # split across devices + mini_batch_size = val.shape[0] + # approach 1: np and caffe2 operators assume the mini-batch size is + # divisible exactly by the number of available devices + if mini_batch_size % self.ndevices != 0: + sys.exit("ERROR: caffe2 net assumes that the mini_batch_size " + + str(mini_batch_size) + + " is evenly divisible by the number of available devices" + + str(self.ndevices)) + vals = np.split(val, self.ndevices, axis=0) + """ + # approach 2: np and caffe2 operators do not assume exact divisibility + if args.mini_batch_size != mini_batch_size: + sys.exit("ERROR: caffe2 net was prepared for mini-batch size " + + str(args.mini_batch_size) + + " which is different from current mini-batch size " + + str(mini_batch_size) + " being passed to it. 
" + + "This is common for the last mini-batch, when " + + "mini-batch size does not evenly divided the number of " + + "elements in the data set.") + ls = where_to_split(mini_batch_size, self.ndevices) + vals = np.split(val, ls, axis=0) + """ + # feed to multiple devices + for d in range(self.ndevices): + tag_on_device = "gpu_" + str(d) + "/" + tag + _d = core.DeviceOption(workspace.GpuDeviceType, d) + workspace.FeedBlob(tag_on_device, vals[d], device_option=_d) + else: + # feed to multiple devices + for d in range(self.ndevices): + tag_on_device = "gpu_" + str(d) + "/" + tag + _d = core.DeviceOption(workspace.GpuDeviceType, d) + workspace.FeedBlob(tag_on_device, val, device_option=_d) + else: + # feed to a single device (named or not) + if device_id >= 0: + _d = core.DeviceOption(workspace.GpuDeviceType, device_id) + workspace.FeedBlob(tag, val, device_option=_d) + else: + workspace.FeedBlob(tag, val) + + def FetchBlobWrapper(self, tag, add_prefix=True, reduce_across=None, device_id=-1): + if self.ndevices > 1 and add_prefix: + # fetch from multiple devices + vals = [] + for d in range(self.ndevices): + if tag.__class__ == list: + tag_on_device = tag[d] + else: + tag_on_device = "gpu_" + str(0) + "/" + tag + val = workspace.FetchBlob(tag_on_device) + vals.append(val) + # reduce across devices + if reduce_across == "add": + return functools.reduce(operator.add, vals) + elif reduce_across == "concat": + return np.concatenate(vals) + else: + return vals + else: + # fetch from a single device (named or not) + if device_id >= 0: + tag_on_device = "gpu_" + str(device_id) + "/" + tag + return workspace.FetchBlob(tag_on_device) + else: + return workspace.FetchBlob(tag) + + def AddLayerWrapper(self, layer, inp_blobs, out_blobs, + add_prefix=True, reset_grad=False, **kwargs): + # auxiliary routine to adjust tags + def adjust_tag(blobs, on_device): + if blobs.__class__ == str: + _blobs = on_device + blobs + elif blobs.__class__ == list: + _blobs = list(map(lambda tag: on_device + tag, blobs)) + else: # blobs.__class__ == model_helper.ModelHelper or something else + _blobs = blobs + return _blobs + + if self.ndevices > 1 and add_prefix: + # add layer on multiple devices + ll = [] + for d in range(self.ndevices): + # add prefix on_device + on_device = "gpu_" + str(d) + "/" + _inp_blobs = adjust_tag(inp_blobs, on_device) + _out_blobs = adjust_tag(out_blobs, on_device) + # WARNING: reset_grad option was exlusively designed for WeightedSum + # with inp_blobs=[w, tag_one, "", lr], where "" will be replaced + if reset_grad: + w_grad = self.gradientMap[_inp_blobs[0]] + _inp_blobs[2] = w_grad + # add layer to the model + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)): + if kwargs: + new_layer = layer(_inp_blobs, _out_blobs, **kwargs) + else: + new_layer = layer(_inp_blobs, _out_blobs) + ll.append(new_layer) + return ll + else: + # add layer on a single device + # WARNING: reset_grad option was exlusively designed for WeightedSum + # with inp_blobs=[w, tag_one, "", lr], where "" will be replaced + if reset_grad: + w_grad = self.gradientMap[inp_blobs[0]] + inp_blobs[2] = w_grad + # add layer to the model + if kwargs: + new_layer = layer(inp_blobs, out_blobs, **kwargs) + else: + new_layer = layer(inp_blobs, out_blobs) + return new_layer + + def create_mlp(self, ln, sigmoid_layer, model, tag): + (tag_layer, tag_in, tag_out) = tag + + # build MLP layer by layer + layers = [] + weights = [] + for i in range(1, ln.size): + n = ln[i - 1] + m = ln[i] + + # create tags + tag_fc_w = tag_layer + 
":::" + "fc" + str(i) + "_w" + tag_fc_b = tag_layer + ":::" + "fc" + str(i) + "_b" + tag_fc_y = tag_layer + ":::" + "fc" + str(i) + "_y" + tag_fc_z = tag_layer + ":::" + "fc" + str(i) + "_z" + if i == ln.size - 1: + tag_fc_z = tag_out + weights.append(tag_fc_w) + weights.append(tag_fc_b) + + # initialize the weights + # approach 1: custom Xavier input, output or two-sided fill + mean = 0.0 # std_dev = np.sqrt(variance) + std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n) + W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32) + std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1)) + b = np.random.normal(mean, std_dev, size=m).astype(np.float32) + self.FeedBlobWrapper(tag_fc_w, W) + self.FeedBlobWrapper(tag_fc_b, b) + # approach 2: caffe2 xavier + # W = self.AddLayerWrapper( + # model.param_init_net.XavierFill, + # [], + # tag_fc_w, + # shape=[m, n] + # ) + # b = self.AddLayerWrapper( + # model.param_init_net.ConstantFill, + # [], + # tag_fc_b, + # shape=[m] + # ) + + # initialize the MLP's momentum for the Adagrad optimizer + if self.emb_optimizer in ["adagrad", "rwsadagrad"]: + # momentum of the weights + self.FeedBlobWrapper( + "momentum_mlp_{}_{}".format(tag_layer, 2 * i - 1), + np.full((m, n), 0, dtype=np.float32) + ) + # momentum of the biases + self.FeedBlobWrapper( + "momentum_mlp_{}_{}".format(tag_layer, 2 * i), + np.full((m), 0, dtype=np.float32) + ) + + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + self.onnx_tsd[tag_fc_w] = (onnx.TensorProto.FLOAT, W.shape) + self.onnx_tsd[tag_fc_b] = (onnx.TensorProto.FLOAT, b.shape) + + # approach 1: construct fully connected operator using model.net + fc = self.AddLayerWrapper( + model.net.FC, [tag_in, tag_fc_w, tag_fc_b], tag_fc_y + ) + # approach 2: construct fully connected operator using brew + # https://github.com/caffe2/tutorials/blob/master/MNIST.ipynb + # fc = brew.fc(model, layer, tag_fc_w, dim_in=m, dim_out=n) + layers.append(fc) + + if i == sigmoid_layer: + # approach 1: construct sigmoid operator using model.net + layer = self.AddLayerWrapper(model.net.Sigmoid, tag_fc_y, tag_fc_z) + # approach 2: using brew (which currently does not support sigmoid) + # tag_sigm = tag_layer + ":::" + "sigmoid" + str(i) + # layer = brew.sigmoid(model,fc,tag_sigmoid) + else: + # approach 1: construct relu operator using model.net + layer = self.AddLayerWrapper(model.net.Relu, tag_fc_y, tag_fc_z) + # approach 2: using brew + # tag_relu = tag_layer + ":::" + "relu" + str(i) + # layer = brew.relu(model,fc,tag_relu) + tag_in = tag_fc_z + layers.append(layer) + + # WARNING: the dependency between layers is implicit in the tags, + # so only the last layer is added to the layers list. It will + # later be used for interactions. 
+ return layers, weights + + def create_emb(self, m, ln, model, tag): + (tag_layer, tag_in, tag_out) = tag + emb_l = [] + weights_l = [] + vw_l = [] + for i in range(0, ln.size): + n = ln[i] + + # select device + if self.ndevices > 1: + d = i % self.ndevices + else: + d = -1 + + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + len_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_l" + ind_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_i" + tbl_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_w" + sum_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_z" + weights_l.append(tbl_s) + + # initialize the weights + # approach 1a: custom + W = np.random.uniform(low=-np.sqrt(1 / n), + high=np.sqrt(1 / n), + size=(n, m)).astype(np.float32) + # approach 1b: numpy rand + # W = ra.rand(n, m).astype(np.float32) + self.FeedBlobWrapper(tbl_s, W, False, device_id=d) + # approach 2: caffe2 xavier + # with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)): + # W = model.param_init_net.XavierFill([], tbl_s, shape=[n, m]) + # save the blob shapes for latter (only needed if onnx is requested) + + # initialize the embedding's momentum for the Adagrad optimizer + if self.emb_optimizer == "adagrad": + self.FeedBlobWrapper("momentum_emb_{}".format(i), + np.full((n, m), 0), add_prefix=False, device_id=d) + elif self.emb_optimizer == "rwsadagrad": + self.FeedBlobWrapper("momentum_emb_{}".format(i), + np.full((n), 0), add_prefix=False, device_id=d) + + if self.save_onnx: + self.onnx_tsd[tbl_s] = (onnx.TensorProto.FLOAT, W.shape) + + # create operator + if self.weighted_pooling is not None: + vw_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_v" + psw_s = on_device + tag_layer + ":::" + "sls" + str(i) + "_s" + VW = np.ones(n).astype(np.float32) + self.FeedBlobWrapper(vw_s, VW, False, device_id=d) + if self.weighted_pooling == "learned": + vw_l.append(vw_s) + grad_on_weights = True + else: + grad_on_weights = False + if self.save_onnx: + self.onnx_tsd[vw_s] = (onnx.TensorProto.FLOAT, VW.shape) + if self.ndevices <= 1: + PSW = model.net.Gather([vw_s, ind_s], [psw_s]) + EE = model.net.SparseLengthsWeightedSum( + [tbl_s, PSW, ind_s, len_s], [sum_s], + grad_on_weights=grad_on_weights + ) + else: + with core.DeviceScope( + core.DeviceOption(workspace.GpuDeviceType, d) + ): + PSW = model.net.Gather([vw_s, ind_s], [psw_s]) + EE = model.net.SparseLengthsWeightedSum( + [tbl_s, PSW, ind_s, len_s], [sum_s], + grad_on_weights=grad_on_weights + ) + else: + if self.ndevices <= 1: + EE = model.net.SparseLengthsSum( + [tbl_s, ind_s, len_s], [sum_s] + ) + else: + with core.DeviceScope( + core.DeviceOption(workspace.GpuDeviceType, d) + ): + EE = model.net.SparseLengthsSum( + [tbl_s, ind_s, len_s], [sum_s] + ) + emb_l.append(EE) + + return emb_l, weights_l, vw_l + + def create_interactions(self, x, ly, model, tag): + (tag_dense_in, tag_sparse_in, tag_int_out) = tag + + if self.arch_interaction_op == "dot": + # concatenate dense and sparse features + tag_int_out_info = tag_int_out + "_info" + T, T_info = model.net.Concat( + x + ly, + [tag_int_out + "_cat_axis0", tag_int_out_info + "_cat_axis0"], + axis=1, + add_axis=1, + ) + # perform a dot product + Z = model.net.BatchMatMul([T, T], tag_int_out + "_matmul", trans_b=1) + # append dense feature with the interactions (into a row vector) + # approach 1: all + # Zflat = model.net.Flatten(Z, tag_int_out + "_flatten", axis=1) + # approach 2: unique + Zflat_all = model.net.Flatten(Z, tag_int_out + "_flatten_all", 
axis=1) + Zflat = model.net.BatchGather( + [Zflat_all, tag_int_out + "_tril_indices"], + tag_int_out + "_flatten" + ) + R, R_info = model.net.Concat( + x + [Zflat], [tag_int_out, tag_int_out_info], axis=1 + ) + elif self.arch_interaction_op == "cat": + # concatenation features (into a row vector) + tag_int_out_info = tag_int_out + "_info" + R, R_info = model.net.Concat( + x + ly, [tag_int_out, tag_int_out_info], axis=1 + ) + else: + sys.exit("ERROR: --arch-interaction-op=" + + self.arch_interaction_op + " is not supported") + + return R + + def create_sequential_forward_ops(self): + # embeddings + tag = (self.temb, self.tsin, self.tsout) + self.emb_l, self.emb_w, self.emb_vw = self.create_emb( + self.m_spa, self.ln_emb, self.model, tag + ) + # bottom mlp + tag = (self.tbot, self.tdin, self.tdout) + self.bot_l, self.bot_w = self.create_mlp(self.ln_bot, self.sigmoid_bot, + self.model, tag) + # interactions + tag = (self.tdout, self.tsout, self.tint) + Z = self.create_interactions([self.bot_l[-1]], self.emb_l, self.model, tag) + + # top mlp + tag = (self.ttop, Z, self.tout) + self.top_l, self.top_w = self.create_mlp(self.ln_top, self.sigmoid_top, + self.model, tag) + # debug prints + # print(self.emb_l) + # print(self.bot_l) + # print(self.top_l) + + # setup the last output variable + self.last_output = self.top_l[-1] + + def create_parallel_forward_ops(self): + # distribute embeddings (model parallelism) + tag = (self.temb, self.tsin, self.tsout) + self.emb_l, self.emb_w, self.emb_vw = self.create_emb( + self.m_spa, self.ln_emb, self.model, tag + ) + # replicate mlp (data parallelism) + tag = (self.tbot, self.tdin, self.tdout) + self.bot_l, self.bot_w = self.create_mlp(self.ln_bot, self.sigmoid_bot, + self.model, tag) + + # add communication (butterfly shuffle) + t_list = [] + for i, emb_output in enumerate(self.emb_l): + # split input + src_d = i % self.ndevices + lo = [emb_output + "_split_" + str(d) for d in range(self.ndevices)] + # approach 1: np and caffe2 operators assume the mini-batch size is + # divisible exactly by the number of available devices + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, src_d)): + self.model.net.Split(emb_output, lo, axis=0) + """ + # approach 2: np and caffe2 operators do not assume exact divisibility + ls = where_to_split(args.mini_batch_size, self.ndevices, _add_leftover=True) + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, src_d)): + emb_output_split = self.model.net.Split( + emb_output, lo, split=lp, axis=0 + ) + """ + # scatter + y = [] + for dst_d in range(len(lo)): + src_blob = lo[dst_d] + dst_blob = str(src_blob).replace( + "gpu_" + str(src_d), "gpu_" + str(dst_d), 1 + ) + if src_blob != dst_blob: + with core.DeviceScope( + core.DeviceOption(workspace.GpuDeviceType, dst_d) + ): + blob = self.model.Copy(src_blob, dst_blob) + else: + blob = dst_blob + y.append(blob) + t_list.append(y) + # adjust lists to be ordered per device + x = list(map(lambda x: list(x), zip(*self.bot_l))) + ly = list(map(lambda y: list(y), zip(*t_list))) + + # interactions + for d in range(self.ndevices): + on_device = "gpu_" + str(d) + "/" + tag = (on_device + self.tdout, on_device + self.tsout, on_device + self.tint) + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)): + self.create_interactions([x[d][-1]], ly[d], self.model, tag) + + # replicate mlp (data parallelism) + tag = (self.ttop, self.tint, self.tout) + self.top_l, self.top_w = self.create_mlp(self.ln_top, self.sigmoid_top, + self.model, tag) + + # debug prints 
+ # print(self.model.net.Proto(),end='\n') + # sys.exit("ERROR: debugging") + + # setup the last output variable + self.last_output = self.top_l[-1] + + def __init__( + self, + m_spa, + ln_emb, + ln_bot, + ln_top, + arch_interaction_op, + arch_interaction_itself=False, + sigmoid_bot=-1, + sigmoid_top=-1, + save_onnx=False, + model=None, + test_net=None, + tag=None, + ndevices=-1, + forward_ops=True, + enable_prof=False, + weighted_pooling=None, + emb_optimizer="sgd" + ): + super(DLRM_Net, self).__init__() + + # init model + if model is None: + global_init_opt = ["caffe2", "--caffe2_log_level=0"] + if enable_prof: + global_init_opt += [ + "--logtostderr=0", + "--log_dir=$HOME", + "--caffe2_logging_print_net_summary=1", + ] + workspace.GlobalInit(global_init_opt) + self.set_tags() + self.model = model_helper.ModelHelper(name="DLRM", init_params=True) + self.test_net = None + else: + # WARNING: assume that workspace and tags have been initialized elsewhere + self.set_tags(tag[0], tag[1], tag[2], tag[3], tag[4], tag[5], tag[6], + tag[7], tag[8], tag[9]) + self.model = model + self.test_net = test_net + + # save arguments + self.m_spa = m_spa + self.ln_emb = ln_emb + self.ln_bot = ln_bot + self.ln_top = ln_top + self.arch_interaction_op = arch_interaction_op + self.arch_interaction_itself = arch_interaction_itself + self.sigmoid_bot = sigmoid_bot + self.sigmoid_top = sigmoid_top + self.save_onnx = save_onnx + self.ndevices = ndevices + self.emb_optimizer = emb_optimizer + if weighted_pooling is not None and weighted_pooling != "fixed": + self.weighted_pooling = "learned" + else: + self.weighted_pooling = weighted_pooling + # onnx types and shapes dictionary + if self.save_onnx: + self.onnx_tsd = {} + # create forward operators + if forward_ops: + if self.ndevices <= 1: + return self.create_sequential_forward_ops() + else: + return self.create_parallel_forward_ops() + + def set_tags( + self, + _tag_layer_top_mlp="top", + _tag_layer_bot_mlp="bot", + _tag_layer_embedding="emb", + _tag_feature_dense_in="dense_in", + _tag_feature_dense_out="dense_out", + _tag_feature_sparse_in="sparse_in", + _tag_feature_sparse_out="sparse_out", + _tag_interaction="interaction", + _tag_dense_output="prob_click", + _tag_dense_target="target", + ): + # layer tags + self.ttop = _tag_layer_top_mlp + self.tbot = _tag_layer_bot_mlp + self.temb = _tag_layer_embedding + # dense feature tags + self.tdin = _tag_feature_dense_in + self.tdout = _tag_feature_dense_out + # sparse feature tags + self.tsin = _tag_feature_sparse_in + self.tsout = _tag_feature_sparse_out + # output and target tags + self.tint = _tag_interaction + self.ttar = _tag_dense_target + self.tout = _tag_dense_output + + def parameters(self): + return self.model + + def get_loss(self): + return self.FetchBlobWrapper(self.loss, reduce_across="add") + + def get_output(self): + return self.FetchBlobWrapper(self.last_output, reduce_across="concat") + + def create(self, X, S_lengths, S_indices, T): + self.create_input(X, S_lengths, S_indices, T) + self.create_model(X, S_lengths, S_indices, T) + + def create_input(self, X, S_lengths, S_indices, T): + # feed input data to blobs + self.FeedBlobWrapper(self.tdin, X, split=True) + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + self.onnx_tsd[self.tdin] = (onnx.TensorProto.FLOAT, X.shape) + + for i in range(len(self.emb_l)): + # select device + if self.ndevices > 1: + d = i % self.ndevices + else: + d = -1 + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + 
str(d) + "/" + len_s = on_device + self.temb + ":::" + "sls" + str(i) + "_l" + ind_s = on_device + self.temb + ":::" + "sls" + str(i) + "_i" + self.FeedBlobWrapper(len_s, np.array(S_lengths[i]), False, device_id=d) + self.FeedBlobWrapper(ind_s, np.array(S_indices[i]), False, device_id=d) + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + lshape = (len(S_lengths[i]),) # =args.mini_batch_size + ishape = (len(S_indices[i]),) + self.onnx_tsd[len_s] = (onnx.TensorProto.INT32, lshape) + self.onnx_tsd[ind_s] = (onnx.TensorProto.INT32, ishape) + + # feed target data to blobs + if T is not None: + zeros_fp32 = np.zeros(T.shape).astype(np.float32) + self.FeedBlobWrapper(self.ttar, zeros_fp32, split=True) + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + self.onnx_tsd[self.ttar] = (onnx.TensorProto.FLOAT, T.shape) + + def create_model(self, X, S_lengths, S_indices, T): + #setup tril indices for the interactions + offset = 1 if self.arch_interaction_itself else 0 + num_fea = len(self.emb_l) + 1 + tril_indices = np.array([j + i * num_fea + for i in range(num_fea) for j in range(i + offset)]) + self.FeedBlobWrapper(self.tint + "_tril_indices", tril_indices) + + # create compute graph + if T is not None: + # WARNING: RunNetOnce call is needed only if we use brew and ConstantFill. + # We could use direct calls to self.model functions above to avoid it + workspace.RunNetOnce(self.model.param_init_net) + workspace.CreateNet(self.model.net) + if self.test_net is not None: + workspace.CreateNet(self.test_net) + + def run(self, X, S_lengths, S_indices, T, test_net=False, enable_prof=False): + # feed input data to blobs + # dense features + self.FeedBlobWrapper(self.tdin, X, split=True) + # sparse features + for i in range(len(self.emb_l)): + # select device + if self.ndevices > 1: + d = i % self.ndevices + else: + d = -1 + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + len_s = on_device + self.temb + ":::" + "sls" + str(i) + "_l" + ind_s = on_device + self.temb + ":::" + "sls" + str(i) + "_i" + self.FeedBlobWrapper(len_s, np.array(S_lengths[i]), False, device_id=d) + self.FeedBlobWrapper(ind_s, np.array(S_indices[i]), False, device_id=d) + + # feed target data to blobs if needed + if T is not None: + self.FeedBlobWrapper(self.ttar, T, split=True) + # execute compute graph + if test_net: + workspace.RunNet(self.test_net) + else: + if enable_prof: + workspace.C.benchmark_net(self.model.net.Name(), 0, 1, True) + else: + workspace.RunNet(self.model.net) + # debug prints + # print("intermediate") + # print(self.FetchBlobWrapper(self.bot_l[-1])) + # for tag_emb in self.emb_l: + # print(self.FetchBlobWrapper(tag_emb)) + # print(self.FetchBlobWrapper(self.tint)) + + def MSEloss(self, scale=1.0): + # add MSEloss to the model + self.AddLayerWrapper(self.model.SquaredL2Distance, [self.tout, self.ttar], "sd") + self.AddLayerWrapper(self.model.Scale, "sd", "sd2", scale=2.0 * scale) + # WARNING: "loss" is a special tag and should not be changed + self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd2", "loss") + + def BCEloss(self, scale=1.0, threshold=0.0): + # add BCEloss to the mode + if 0.0 < threshold and threshold < 1.0: + self.AddLayerWrapper(self.model.Clip, self.tout, "tout_c", + min=threshold, max=(1.0 - threshold)) + self.AddLayerWrapper(self.model.MakeTwoClass, "tout_c", "tout_2c") + else: + self.AddLayerWrapper(self.model.MakeTwoClass, self.tout, "tout_2c") + 
self.AddLayerWrapper(self.model.LabelCrossEntropy, ["tout_2c", self.ttar], "sd") + # WARNING: "loss" is a special tag and should not be changed + if scale == 1.0: + self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd", "loss") + else: + self.AddLayerWrapper(self.model.Scale, "sd", "sd2", scale=scale) + self.loss = self.AddLayerWrapper(self.model.AveragedLoss, "sd2", "loss") + + def sgd_optimizer(self, learning_rate, + T=None, _gradientMap=None, sync_dense_params=True): + # create one, it and lr tags (or use them if already present) + if T is not None: + (tag_one, tag_it, tag_lr) = T + else: + (tag_one, tag_it, tag_lr) = ("const_one", "optim_it", "optim_lr") + + # approach 1: feed values directly + # self.FeedBlobWrapper(tag_one, np.ones(1).astype(np.float32)) + # self.FeedBlobWrapper(tag_it, np.zeros(1).astype(np.int64)) + # it = self.AddLayerWrapper(self.model.Iter, tag_it, tag_it) + # lr = self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr, + # base_lr=-1 * learning_rate, policy="fixed") + # approach 2: use brew + self.AddLayerWrapper(self.model.param_init_net.ConstantFill, + [], tag_one, shape=[1], value=1.0) + self.AddLayerWrapper(brew.iter, self.model, tag_it) + self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr, + base_lr=-1 * learning_rate, policy="fixed") + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + self.onnx_tsd[tag_one] = (onnx.TensorProto.FLOAT, (1,)) + self.onnx_tsd[tag_it] = (onnx.TensorProto.INT64, (1,)) + + # create gradient maps (or use them if already present) + if _gradientMap is not None: + self.gradientMap = _gradientMap + else: + if self.loss.__class__ == list: + self.gradientMap = self.model.AddGradientOperators(self.loss) + else: + self.gradientMap = self.model.AddGradientOperators([self.loss]) + + # update weights + # approach 1: builtin function + # optimizer.build_sgd(self.model, base_learning_rate=learning_rate) + # approach 2: custom code + # top MLP weight and bias + for w in self.top_w: + # allreduce across devices if needed + if sync_dense_params and self.ndevices > 1: + grad_blobs = [ + self.gradientMap["gpu_{}/".format(d) + w] + for d in range(self.ndevices) + ] + self.model.NCCLAllreduce(grad_blobs, grad_blobs) + # update weights + self.AddLayerWrapper(self.model.WeightedSum, + [w, tag_one, "", tag_lr], w, reset_grad=True) + # bottom MLP weight and bias + for w in self.bot_w: + # allreduce across devices if needed + if sync_dense_params and self.ndevices > 1: + grad_blobs = [ + self.gradientMap["gpu_{}/".format(d) + w] + for d in range(self.ndevices) + ] + self.model.NCCLAllreduce(grad_blobs, grad_blobs) + # update weights + self.AddLayerWrapper(self.model.WeightedSum, + [w, tag_one, "", tag_lr], w, reset_grad=True) + # update embeddings + for i, w in enumerate(self.emb_w): + # select device + if self.ndevices > 1: + d = i % self.ndevices + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + _tag_one = on_device + tag_one + _tag_lr = on_device + tag_lr + # pickup gradient + w_grad = self.gradientMap[w] + # update weights + if self.ndevices > 1: + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)): + self.model.ScatterWeightedSum([w, _tag_one, w_grad.indices, + w_grad.values, _tag_lr], w) + else: + self.model.ScatterWeightedSum([w, _tag_one, w_grad.indices, + w_grad.values, _tag_lr], w) + + # update per sample weights + if self.weighted_pooling == "learned": + for i, w in enumerate(self.emb_vw): + # select device + if self.ndevices 
> 1: + d = i % self.ndevices + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + _tag_one = on_device + tag_one + _tag_lr = on_device + tag_lr + # pickup gradient + w_grad = self.gradientMap[w] + # update weights + if self.ndevices > 1: + with core.DeviceScope( + core.DeviceOption(workspace.GpuDeviceType, d) + ): + self.model.ScatterWeightedSum( + [w, _tag_one, w_grad.indices, + w_grad.values, _tag_lr], w + ) + else: + self.model.ScatterWeightedSum( + [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w + ) + + def adagrad_optimizer(self, learning_rate, + T=None, _gradientMap=None, sync_dense_params=True, + epsilon=1e-10, decay_=0.0, weight_decay_=0.0): + # create one, it and lr tags (or use them if already present) + if T is not None: + (tag_one, tag_it, tag_lr) = T + else: + (tag_one, tag_it, tag_lr) = ("const_one", "optim_it", "optim_lr") + + # approach 1: feed values directly + # self.FeedBlobWrapper(tag_one, np.ones(1).astype(np.float32)) + # self.FeedBlobWrapper(tag_it, np.zeros(1).astype(np.int64)) + # it = self.AddLayerWrapper(self.model.Iter, tag_it, tag_it) + # lr = self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr, + # base_lr=-1 * learning_rate, policy="fixed") + # approach 2: use brew + self.AddLayerWrapper(self.model.param_init_net.ConstantFill, + [], tag_one, shape=[1], value=1.0) + self.AddLayerWrapper(brew.iter, self.model, tag_it) + self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr, + base_lr=-1 * learning_rate, policy="fixed") + # save the blob shapes for latter (only needed if onnx is requested) + if self.save_onnx: + self.onnx_tsd[tag_one] = (onnx.TensorProto.FLOAT, (1,)) + self.onnx_tsd[tag_it] = (onnx.TensorProto.INT64, (1,)) + + # create gradient maps (or use them if already present) + if _gradientMap is not None: + self.gradientMap = _gradientMap + else: + if self.loss.__class__ == list: + self.gradientMap = self.model.AddGradientOperators(self.loss) + else: + self.gradientMap = self.model.AddGradientOperators([self.loss]) + + # update weights + # approach 1: builtin function + # optimizer.build_sgd(self.model, base_learning_rate=learning_rate) + # approach 2: custom code + # top MLP weight and bias + for i, w in enumerate(self.top_w): + # allreduce across devices if needed + if sync_dense_params and self.ndevices > 1: + grad_blobs = [ + self.gradientMap["gpu_{}/".format(d) + w] + for d in range(self.ndevices) + ] + self.model.NCCLAllreduce(grad_blobs, grad_blobs) + # update weights + self.model.Adagrad( + [ + w, + "momentum_mlp_top_{}".format(i + 1), + self.gradientMap[w], + tag_lr + ], + [w, "momentum_mlp_top_{}".format(i + 1)], + epsilon=epsilon, + decay_=decay_, + weight_decay_=weight_decay_ + ) + + # bottom MLP weight and bias + for i, w in enumerate(self.bot_w): + # allreduce across devices if needed + if sync_dense_params and self.ndevices > 1: + grad_blobs = [ + self.gradientMap["gpu_{}/".format(d) + w] + for d in range(self.ndevices) + ] + self.model.NCCLAllreduce(grad_blobs, grad_blobs) + # update weights + self.model.Adagrad( + [ + w, + "momentum_mlp_bot_{}".format(i + 1), + self.gradientMap[w], + tag_lr + ], + [w, "momentum_mlp_bot_{}".format(i + 1)], + epsilon=epsilon, + decay_=decay_, + weight_decay_=weight_decay_ + ) + + # update embeddings + for i, w in enumerate(self.emb_w): + # select device + if self.ndevices > 1: + d = i % self.ndevices + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + _tag_one = on_device + tag_one + _tag_lr = on_device + tag_lr + # 
pickup gradient + w_grad = self.gradientMap[w] + # update weights + def add_optimizer(): + self.model.Unique( + w_grad.indices, + ["unique_w_grad_indices", "remapping_w_grad_indices"] + ) + self.model.UnsortedSegmentSum( + [w_grad.values, "remapping_w_grad_indices"], + "unique_w_grad_values" + ) + + if self.emb_optimizer == "adagrad": + self.model.SparseAdagrad( + [ + w, + "momentum_emb_{}".format(i), + "unique_w_grad_indices", + "unique_w_grad_values", + _tag_lr + ], + [w, "momentum_emb_{}".format(i)], + epsilon=epsilon, + decay_=decay_, + weight_decay_=weight_decay_ + ) + + elif self.emb_optimizer == "rwsadagrad": + self.model.RowWiseSparseAdagrad( + [ + w, + "momentum_emb_{}".format(i), + "unique_w_grad_indices", + "unique_w_grad_values", + _tag_lr + ], + [w, "momentum_emb_{}".format(i)], + epsilon=epsilon, + decay_=decay_, + weight_decay_=weight_decay_ + ) + + if self.ndevices > 1: + with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)): + add_optimizer() + else: + add_optimizer() + + # update per sample weights + if self.weighted_pooling == "learned": + for i, w in enumerate(self.emb_vw): + # select device + if self.ndevices > 1: + d = i % self.ndevices + # create tags + on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/" + _tag_one = on_device + tag_one + _tag_lr = on_device + tag_lr + # pickup gradient + w_grad = self.gradientMap[w] + # update weights + if self.ndevices > 1: + with core.DeviceScope( + core.DeviceOption(workspace.GpuDeviceType, d) + ): + self.model.ScatterWeightedSum( + [w, _tag_one, w_grad.indices, + w_grad.values, _tag_lr], w + ) + else: + self.model.ScatterWeightedSum( + [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w + ) + + def print_all(self): + # approach 1: all + print(workspace.Blobs(), end='\n') + for _, l in enumerate(workspace.Blobs()): + print(l) + print(self.FetchBlobWrapper(l)) + # approach 2: only summary + # for param in self.model.params: + # self.model.Summarize(param, [], to_file=1) + # self.model.Summarize(self.model.param_to_grad[param], [], to_file=1) + + def print_weights(self): + for _, l in enumerate(self.emb_w): + # print(l) + print(self.FetchBlobWrapper(l, False)) + if self.weighted_pooling == "learned": + for _, l in enumerate(self.emb_vw): + # print(l) + print(self.FetchBlobWrapper(l, False)) + for _, l in enumerate(self.bot_w): + # print(l) + if self.ndevices > 1: + print(self.FetchBlobWrapper(l, False, device_id=0)) + else: + print(self.FetchBlobWrapper(l)) + for _, l in enumerate(self.top_w): + # print(l) + if self.ndevices > 1: + print(self.FetchBlobWrapper(l, False, device_id=0)) + else: + print(self.FetchBlobWrapper(l)) + + def print_activations(self): + for _, l in enumerate(self.emb_l): + print(l) + print(self.FetchBlobWrapper(l, False)) + for _, l in enumerate(self.bot_l): + print(l) + print(self.FetchBlobWrapper(l)) + print(self.tint) + print(self.FetchBlobWrapper(self.tint)) + for _, l in enumerate(self.top_l): + print(l) + print(self.FetchBlobWrapper(l)) + + +def define_metrics(): + metrics = { + 'loss': lambda y_true, y_score: + sklearn.metrics.log_loss( + y_true=y_true, + y_pred=y_score, + labels=[0,1]), + 'recall': lambda y_true, y_score: + sklearn.metrics.recall_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'precision': lambda y_true, y_score: + sklearn.metrics.precision_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'f1': lambda y_true, y_score: + sklearn.metrics.f1_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'ap': 
sklearn.metrics.average_precision_score, + 'roc_auc': sklearn.metrics.roc_auc_score, + 'accuracy': lambda y_true, y_score: + sklearn.metrics.accuracy_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + # 'pre_curve' : sklearn.metrics.precision_recall_curve, + # 'roc_curve' : sklearn.metrics.roc_curve, + } + return metrics + + +def calculate_metrics(targets, scores): + scores = np.concatenate(scores, axis=0) + targets = np.concatenate(targets, axis=0) + + metrics = define_metrics() + + # print("Compute time for validation metric : ", end="") + # first_it = True + validation_results = {} + for metric_name, metric_function in metrics.items(): + # if first_it: + # first_it = False + # else: + # print(", ", end="") + # metric_compute_start = time_wrap(False) + try: + validation_results[metric_name] = metric_function( + targets, + scores + ) + except Exception as error : + validation_results[metric_name] = -1 + print("{} in calculating {}".format(error, metric_name)) + # metric_compute_end = time_wrap(False) + # met_time = metric_compute_end - metric_compute_start + # print("{} {:.4f}".format(metric_name, 1000 * (met_time)), + # end="") + # print(" ms") + return validation_results + + +if __name__ == "__main__": + ### import packages ### + import sys + import argparse + + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Train Deep Learning Recommendation Model (DLRM)" + ) + # model related parameters + parser.add_argument("--arch-sparse-feature-size", type=int, default=2) + parser.add_argument("--arch-embedding-size", type=str, default="4-3-2") + parser.add_argument("--arch-mlp-bot", type=str, default="4-3-2") + parser.add_argument("--arch-mlp-top", type=str, default="4-2-1") + parser.add_argument("--arch-interaction-op", type=str, default="dot") + parser.add_argument("--arch-interaction-itself", action="store_true", default=False) + # activations and loss + parser.add_argument("--activation-function", type=str, default="relu") + parser.add_argument("--loss-function", type=str, default="mse") # or bce + parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 + parser.add_argument("--round-targets", type=bool, default=False) + parser.add_argument("--weighted-pooling", type=str, default=None) + # data + parser.add_argument("--data-size", type=int, default=1) + parser.add_argument("--num-batches", type=int, default=0) + parser.add_argument("--data-generation", type=str, default="random") # or synthetic or dataset + parser.add_argument("--rand-data-dist", type=str, default="uniform") # uniform or gaussian + parser.add_argument("--rand-data-min", type=float, default=0) + parser.add_argument("--rand-data-max", type=float, default=1) + parser.add_argument("--rand-data-mu", type=float, default=-1) + parser.add_argument("--rand-data-sigma", type=float, default=1) + parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log") + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--data-trace-enable-padding", type=bool, default=False) + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--num-indices-per-lookup", type=int, default=10) + 
parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False) + parser.add_argument("--num-workers", type=int, default=0) + parser.add_argument("--memory-map", action="store_true", default=False) + # training + parser.add_argument("--mini-batch-size", type=int, default=1) + parser.add_argument("--nepochs", type=int, default=1) + parser.add_argument("--learning-rate", type=float, default=0.01) + parser.add_argument("--print-precision", type=int, default=5) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + parser.add_argument("--sync-dense-params", type=bool, default=True) + parser.add_argument("--caffe2-net-type", type=str, default="") + parser.add_argument("--optimizer", type=str, default="sgd", + help="""This is the optimizer for embedding tables.""") + parser.add_argument( + "--dataset-multiprocessing", + action="store_true", + default=False, + help="The Kaggle dataset can be multiprocessed in an environment \ + with more than 7 CPU cores and more than 20 GB of memory. \n \ + The Terabyte dataset can be multiprocessed in an environment \ + with more than 24 CPU cores and at least 1 TB of memory.", + ) + # inference + parser.add_argument("--inference-only", action="store_true", default=False) + # onnx (or protobuf with shapes) + parser.add_argument("--save-onnx", action="store_true", default=False) + parser.add_argument("--save-proto-types-shapes", action="store_true", default=False) + # gpu + parser.add_argument("--use-gpu", action="store_true", default=False) + # debugging and profiling + parser.add_argument("--print-freq", type=int, default=1) + parser.add_argument("--test-freq", type=int, default=-1) + parser.add_argument("--test-mini-batch-size", type=int, default=-1) + parser.add_argument("--test-num-workers", type=int, default=-1) + parser.add_argument("--print-time", action="store_true", default=False) + parser.add_argument("--debug-mode", action="store_true", default=False) + parser.add_argument("--enable-profiling", action="store_true", default=False) + parser.add_argument("--plot-compute-graph", action="store_true", default=False) + # mlperf logging (disables other output and stops early) + parser.add_argument("--mlperf-logging", action="store_true", default=False) + # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107 + parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0) + # stop at target AUC Terabyte (no subsampling) 0.8025 + parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) + args = parser.parse_args() + + if args.dataset_multiprocessing: + assert float(sys.version[:3]) > 3.7, "The dataset_multiprocessing " + \ + "flag is susceptible to a bug in Python 3.7 and under. " + \ + "https://github.com/facebookresearch/dlrm/issues/172" + + ### some basic setup ### + # WARNING: to obtain exactly the same initialization for + # the weights we need to start from the same random seed. 
+ np.random.seed(args.numpy_rand_seed) + + np.set_printoptions(precision=args.print_precision) + if (args.test_mini_batch_size < 0): + # if the parameter is not set, use the training batch size + args.test_mini_batch_size = args.mini_batch_size + if (args.test_num_workers < 0): + # if the parameter is not set, use the same parameter for training + args.test_num_workers = args.num_workers + + use_gpu = args.use_gpu + if use_gpu: + device_opt = core.DeviceOption(workspace.GpuDeviceType, 0) + ngpus = workspace.NumGpuDevices() # 1 + print("Using {} GPU(s)...".format(ngpus)) + else: + device_opt = core.DeviceOption(caffe2_pb2.CPU) + print("Using CPU...") + + ### prepare training data ### + ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") + if args.data_generation == "dataset": + if args.num_workers > 0 or args.test_num_workers > 0: + print("WARNING: non default --num-workers or --test-num-workers options" + + " are not supported and will be ignored") + if args.mini_batch_size != args.test_mini_batch_size: + print("WARNING: non default ----test-mini-batch-size option" + + " is not supported and will be ignored") + + # input and target from dataset + + train_data, train_ld, test_data, test_ld = \ + dp.make_criteo_data_and_loaders( + args, + offset_to_length_converter=True, + ) + + nbatches = args.num_batches if args.num_batches > 0 \ + else len(train_ld) + + nbatches_test = len(test_ld) + + ln_emb = train_data.counts + m_den = train_data.m_den + + # enforce maximum limit on number of vectors per embedding + if args.max_ind_range > 0: + ln_emb = np.array(list(map( + lambda x: x if x < args.max_ind_range else args.max_ind_range, + ln_emb + ))) + ln_bot[0] = m_den + + else: + if args.num_workers > 0 or args.test_num_workers > 0: + print("WARNING: non default --num-workers or --test-num-workers options" + + " are not supported and will be ignored") + if args.mini_batch_size != args.test_mini_batch_size: + print("WARNING: non default ----test-mini-batch-size option" + + " is not supported and will be ignored") + + # input and target at random + ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") + m_den = ln_bot[0] + train_data, train_ld, test_data, test_ld = dp.make_random_data_and_loader(args, ln_emb, m_den, \ + offset_to_length_converter=True, + ) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + nbatches_test = len(test_ld) + # table_feature_map = {idx : idx for idx in range(len(ln_emb))} + + ### parse command line arguments ### + m_spa = args.arch_sparse_feature_size + ln_emb = np.asarray(ln_emb) + num_fea = ln_emb.size + 1 # num sparse + num dense features + m_den_out = ln_bot[ln_bot.size - 1] + if args.arch_interaction_op == "dot": + # approach 1: all + # num_int = num_fea * num_fea + m_den_out + # approach 2: unique + if args.arch_interaction_itself: + num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out + else: + num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out + elif args.arch_interaction_op == "cat": + num_int = num_fea * m_den_out + else: + sys.exit("ERROR: --arch-interaction-op=" + + args.arch_interaction_op + " is not supported") + arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top + ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") + # sanity check: feature sizes and mlp dimensions must match + if m_den != ln_bot[0]: + sys.exit("ERROR: arch-dense-feature-size " + + str(m_den) + " does not match first dim of bottom mlp " + str(ln_bot[0])) + if m_spa != m_den_out: + sys.exit("ERROR: 
arch-sparse-feature-size " + + str(m_spa) + " does not match last dim of bottom mlp " + str(m_den_out)) + if num_int != ln_top[0]: + sys.exit("ERROR: # of feature interactions " + + str(num_int) + " does not match first dim of top mlp " + str(ln_top[0])) + + # test prints (model arch) + if args.debug_mode: + print("model arch:") + print("mlp top arch " + str(ln_top.size - 1) + + " layers, with input to output dimensions:") + print(ln_top) + + print("# of interactions") + print(num_int) + print("mlp bot arch " + str(ln_bot.size - 1) + + " layers, with input to output dimensions:") + print(ln_bot) + print("# of features (sparse and dense)") + print(num_fea) + print("dense feature size") + print(m_den) + print("sparse feature size") + print(m_spa) + print("# of embeddings (= # of sparse features) " + str(ln_emb.size) + + ", with dimensions " + str(m_spa) + "x:") + print(ln_emb) + + print("data (inputs and targets):") + for j, inputBatch in enumerate(train_ld): + lX_j, lS_l_j, lS_i_j, lT_j = inputBatch + print("mini-batch: %d" % j) + print(lX_j) + print(lS_l_j) + print(lS_i_j) + print(lT_j) + + ### construct the neural network specified above ### + # WARNING: to obtain exactly the same initialization for + # the weights we need to start from the same random seed. + # np.random.seed(args.numpy_rand_seed) + ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1 + flag_types_shapes = args.save_onnx or args.save_proto_types_shapes + flag_forward_ops = not (use_gpu and ndevices > 1) + with core.DeviceScope(device_opt): + dlrm = DLRM_Net( + m_spa, + ln_emb, + ln_bot, + ln_top, + args.arch_interaction_op, + arch_interaction_itself=args.arch_interaction_itself, + sigmoid_bot=-1, + sigmoid_top=ln_top.size - 1, + save_onnx=flag_types_shapes, + ndevices=ndevices, + # forward_ops = flag_forward_ops + enable_prof=args.enable_profiling, + weighted_pooling=args.weighted_pooling, + emb_optimizer=args.optimizer + ) + # load nccl if using multiple devices + if args.sync_dense_params and ndevices > 1: + dyndep.InitOpsLibrary("//caffe2/caffe2/contrib/nccl:nccl_ops") + # set the net type for better performance (dag, async_scheduling, etc) + if args.caffe2_net_type: + dlrm.parameters().net.Proto().type = args.caffe2_net_type + # plot compute graph + if args.plot_compute_graph: + graph = net_drawer.GetPydotGraph( + dlrm.parameters().net, + "dlrm_s_caffe2_graph", + "BT" + ) + graph.write_pdf(graph.get_name() + ".pdf") + # test prints + if args.debug_mode: + print("initial parameters (weights and bias):") + dlrm.print_weights() + + # add training loss if needed + if not args.inference_only: + with core.DeviceScope(device_opt): + # specify the loss function + nd = 1.0 if dlrm.ndevices <= 1 else 1.0 / dlrm.ndevices # 1 + if args.loss_function == "mse": + dlrm.MSEloss(scale=nd) + elif args.loss_function == "bce": + dlrm.BCEloss(scale=nd, threshold=args.loss_threshold) + else: + sys.exit("ERROR: --loss-function=" + args.loss_function + + " is not supported") + + # define test net (as train net without gradients) + dlrm.test_net = core.Net(copy.deepcopy(dlrm.model.net.Proto())) + + # specify the optimizer algorithm + if args.optimizer == "sgd": + dlrm.sgd_optimizer( + args.learning_rate, sync_dense_params=args.sync_dense_params + ) + elif args.optimizer in ["adagrad", "rwsadagrad"]: + dlrm.adagrad_optimizer( + args.learning_rate, sync_dense_params=args.sync_dense_params + ) + else: + sys.exit("""ERROR: Select an optimizer for + embedding tables : 'sgd', 'adagrad', + or 'rwsadagrad' """) + + # 
init/create + X, lS_l, lS_i, T = next(iter(train_ld)) # does not affect the enumerate(train_ld) in the main loop + dlrm.create(X, lS_l, lS_i, T.int()) + + ### main loop ### + best_gA_test = 0 + best_auc_test = 0 + total_time = 0 + total_loss = 0 + total_accu = 0 + total_iter = 0 + total_samp = 0 + k = 0 + + print("time/loss/accuracy (if enabled):") + while k < args.nepochs: + j = 0 + for j, inputBatch in enumerate(train_ld): + # forward and backward pass, where the latter runs only + # when gradients and loss have been added to the net + time1 = time.time() + lX_j, lS_l_j, lS_i_j, lT_j = inputBatch + lT_j = lT_j.int() if args.loss_function == "bce" else lT_j + dlrm.run(lX_j, lS_l_j, lS_i_j, lT_j) + + time2 = time.time() + total_time += time2 - time1 + + # compte loss and accuracy + Z = dlrm.get_output() # numpy array + T = lT_j.numpy() + ''' + # debug prints + print("output and loss") + print(Z) + print(dlrm.get_loss()) + ''' + mbs = T.shape[0] # = args.mini_batch_size except maybe for last + A = np.sum((np.round(Z, 0) == T).astype(np.uint8)) + total_accu += 0 if args.inference_only else A + total_loss += 0 if args.inference_only else dlrm.get_loss() * mbs + total_iter += 1 + total_samp += mbs + + # print time, loss and accuracy + should_print = ((j + 1) % args.print_freq == 0) or (j + 1 == nbatches) + should_test = ( + (args.test_freq > 0) + and (args.data_generation in ["dataset", "random"]) + and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches)) + ) + if should_print or should_test: + gT = 1000. * total_time / total_iter if args.print_time else -1 + total_time = 0 + + gA = total_accu / total_samp + total_accu = 0 + + gL = total_loss / total_samp + total_loss = 0 + + str_run_type = "inference" if args.inference_only else "training" + print( + "Finished {} it {}/{} of epoch {}, {:.2f} ms/it,".format( + str_run_type, j + 1, nbatches, k, gT + ) + + " loss {:.6f}".format(gL) + ) + total_iter = 0 + total_samp = 0 + # debug prints + # print(Z) + # print(T) + + # testing + if should_test and not args.inference_only: + # don't measure training iter time in a test iteration + if args.mlperf_logging: + previous_iteration_time = None + + test_accu = 0 + test_loss = 0 + test_samp = 0 + + if args.mlperf_logging: + scores = [] + targets = [] + + for i, testBatch in enumerate(test_ld): + # early exit if nbatches was set by the user and was exceeded + if nbatches > 0 and i >= nbatches: + break + + # forward pass + + lX_test_i, lS_l_test_i, lS_i_test_i, lT_test_i = testBatch + lT_test_i = lT_test_i.int() if args.loss_function == "bce" else lT_test_i + dlrm.run(lX_test_i, lS_l_test_i, lS_i_test_i, lT_test_i, test_net=True) + + Z_test = dlrm.get_output() + T_test = lT_test_i.numpy() + + if args.mlperf_logging: + scores.append(Z_test) + targets.append(T_test) + else: + # compte loss and accuracy + L_test = dlrm.get_loss() + mbs_test = T_test.shape[0] # = mini_batch_size except last + A_test = np.sum((np.round(Z_test, 0) == T_test).astype(np.uint8)) + test_accu += A_test + test_loss += L_test * mbs_test + test_samp += mbs_test + + # compute metrics (after test loop has finished) + if args.mlperf_logging: + validation_results = calculate_metrics(targets, scores) + gA_test = validation_results['accuracy'] + gL_test = validation_results['loss'] + else: + gA_test = test_accu / test_samp + gL_test = test_loss / test_samp + + # print metrics + is_best = gA_test > best_gA_test + if is_best: + best_gA_test = gA_test + + if args.mlperf_logging: + is_best = validation_results['roc_auc'] > best_auc_test + if 
is_best: + best_auc_test = validation_results['roc_auc'] + + print( + "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k) + + " loss {:.6f}, recall {:.4f}, precision {:.4f},".format( + validation_results['loss'], + validation_results['recall'], + validation_results['precision'] + ) + + " f1 {:.4f}, ap {:.4f},".format( + validation_results['f1'], + validation_results['ap'], + ) + + " auc {:.4f}, best auc {:.4f},".format( + validation_results['roc_auc'], + best_auc_test + ) + + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format( + validation_results['accuracy'] * 100, + best_gA_test * 100 + ) + ) + else: + print( + "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, 0) + + " loss {:.6f}, accuracy {:3.3f} %, best {:3.3f} %".format( + gL_test, gA_test * 100, best_gA_test * 100 + ) + ) + + # check thresholds + if (args.mlperf_logging + and (args.mlperf_acc_threshold > 0) + and (best_gA_test > args.mlperf_acc_threshold)): + print("MLPerf testing accuracy threshold " + + str(args.mlperf_acc_threshold) + + " reached, stop training") + break + + if (args.mlperf_logging + and (args.mlperf_auc_threshold > 0) + and (best_auc_test > args.mlperf_auc_threshold)): + print("MLPerf testing auc threshold " + + str(args.mlperf_auc_threshold) + + " reached, stop training") + break + + j += 1 # nbatches + k += 1 # nepochs + + # test prints + if not args.inference_only and args.debug_mode: + print("updated parameters (weights and bias):") + dlrm.print_weights() + + # build onnx model from caffe2 + if args.save_onnx: + pnet = dlrm.parameters().net.Proto() + inet = dlrm.parameters().param_init_net.Proto() + value_info = dlrm.onnx_tsd # None + # debug prints + # print(value_info) + + # WARNING: Why Caffe2 to ONNX net transformation currently does not work? + # 1. ONNX does not support SparseLengthsSum operator directly. A workaround + # could be for the Caffe2 ONNX frontend to indirectly map this operator to + # Gather and ReducedSum ONNX operators, following the PyTorch approach. 
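# [Editor's note: illustrative sketch, not part of the patch.]
# The workaround described in the warning above, in isolation: a pooled sparse
# lookup in the SparseLengthsSum style can be expressed as a Gather followed by a
# per-segment reduction, which is what an ONNX export would need to emit instead
# of the unsupported operator. Plain NumPy restatement with made-up shapes; this
# is not the actual Caffe2-to-ONNX mapping code.
import numpy as np

E = np.arange(20, dtype=np.float32).reshape(10, 2)   # embedding table: 10 rows of dim 2
indices = np.array([1, 3, 3, 7])                     # flattened sparse indices
lengths = np.array([3, 1])                           # indices per sample (Lengths layout)

gathered = E[indices]                                    # Gather
segments = np.repeat(np.arange(len(lengths)), lengths)   # [0, 0, 0, 1]
pooled = np.zeros((len(lengths), E.shape[1]), dtype=np.float32)
np.add.at(pooled, segments, gathered)                    # per-segment ReduceSum
# pooled[0] == E[1] + E[3] + E[3]; pooled[1] == E[7]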
+ c2f = caffe2.python.onnx.frontend.Caffe2Frontend() + dlrm_caffe2_onnx = c2f.caffe2_net_to_onnx_model(pnet, inet, value_info) + # check the onnx model + onnx.checker.check_model(dlrm_caffe2_onnx) + + # save model to a file + with open("dlrm_s_caffe2.onnx", "w+") as dlrm_caffe2_onnx_file: + dlrm_caffe2_onnx_file.write(str(dlrm_caffe2_onnx)) + + # build protobuf with types and shapes + if args.save_proto_types_shapes: + # add types and shapes to protobuf + __TYPE_MAPPING = { + onnx.TensorProto.FLOAT: caffe2_pb2.TensorProto.FLOAT, + onnx.TensorProto.UINT8: caffe2_pb2.TensorProto.UINT8, + onnx.TensorProto.INT8: caffe2_pb2.TensorProto.INT8, + onnx.TensorProto.UINT16: caffe2_pb2.TensorProto.UINT16, + onnx.TensorProto.INT16: caffe2_pb2.TensorProto.INT16, + onnx.TensorProto.INT32: caffe2_pb2.TensorProto.INT32, + onnx.TensorProto.INT64: caffe2_pb2.TensorProto.INT64, + onnx.TensorProto.STRING: caffe2_pb2.TensorProto.STRING, + onnx.TensorProto.BOOL: caffe2_pb2.TensorProto.BOOL, + onnx.TensorProto.FLOAT16: caffe2_pb2.TensorProto.FLOAT16, + onnx.TensorProto.DOUBLE: caffe2_pb2.TensorProto.DOUBLE, + } + + pnet = dlrm.parameters().net.Proto() + arg = pnet.arg.add() + arg.name = "input_shape_info" + for i in pnet.external_input: + if i in dlrm.onnx_tsd: + onnx_dtype, shape = dlrm.onnx_tsd[i] + t = arg.tensors.add() + t.name = i + t.data_type = __TYPE_MAPPING[onnx_dtype] + t.dims.extend(shape) + else: + print("Warning: we don't have shape/type info for input: {}".format(i)) + # debug print + # print(pnet) + + # export the protobuf with types and shapes + with open("dlrm_s_caffe2.proto", "w+") as dlrm_s_proto_file: + dlrm_s_proto_file.write(str(pnet)) + + """ + # export the protobuf with types and shapes as well as weights + # see https://github.com/pytorch/pytorch/issues/9533 + #save + net = dlrm.parameters().net + params = dlrm.parameters().params + init_net, predict_net = mobile_exporter.Export(workspace, net, params) + with open("dlrm_s_caffe2.predict", "wb") as dlrm_s_predict_file: + dlrm_s_predict_file.write(predict_net.SerializeToString()) + with open("dlrm_s_caffe2.init", "wb") as dlrm_s_init_file: + dlrm_s_init_file.write(init_net.SerializeToString()) + #load + net_def = caffe2_pb2.NetDef() + init_def= caffe2_pb2.NetDef() + with open("dlrm_s_caffe2.predict", "rb") as dlrm_s_predict_file: + net_def.ParseFromString(dlrm_s_predict_file.read()) + print(net_def) + with open("dlrm_s_caffe2.init", "rb") as dlrm_s_init_file: + init_def.ParseFromString(dlrm_s_init_file.read()) + print(init_def) + """ diff --git a/benchmarks/dlrm/ootb/dlrm_s_pytorch.py b/benchmarks/dlrm/ootb/dlrm_s_pytorch.py new file mode 100644 index 0000000..1774eb4 --- /dev/null +++ b/benchmarks/dlrm/ootb/dlrm_s_pytorch.py @@ -0,0 +1,2511 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: an implementation of a deep learning recommendation model (DLRM) +# The model input consists of dense and sparse features. The former is a vector +# of floating point values. The latter is a list of sparse indices into +# embedding tables, which consist of vectors of floating point values. +# The selected vectors are passed to mlp networks denoted by triangles, +# in some cases the vectors are interacted through operators (Ops). +# +# output: +# vector of values +# model: | +# /\ +# /__\ +# | +# _____________________> Op <___________________ +# / | \ +# /\ /\ /\ +# /__\ /__\ ... 
/__\ +# | | | +# | Op Op +# | ____/__\_____ ____/__\____ +# | |_Emb_|____|__| ... |_Emb_|__|___| +# input: +# [ dense features ] [sparse indices] , ..., [sparse indices] +# +# More precise definition of model layers: +# 1) fully connected layers of an mlp +# z = f(y) +# y = Wx + b +# +# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) +# z = Op(e1,...,ek) +# obtain vectors e1=E[:,p1], ..., ek=E[:,pk] +# +# 3) Operator Op can be one of the following +# Sum(e1,...,ek) = e1 + ... + ek +# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] +# Cat(e1,...,ek) = [e1', ..., ek']' +# where ' denotes transpose operation +# +# References: +# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, +# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, +# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, +# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, +# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, +# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and +# Recommendation Systems", CoRR, arXiv:1906.00091, 2019 + +# TERMS: +# +# qr_ quotient-remainder trick +# md_ mixed-dimension trick +# lS_i Indices used as inputs to embedding bag operators. Indices determine +# which embeddings to select. +# lS_o Offsets used as inputs to embedding bag operators. Offsets determine how +# the selected embeddings are grouped together for the 'mode' operation. +# (Mode operation examples: sum, mean, max) + +from __future__ import absolute_import, division, print_function, unicode_literals + +import argparse + +# miscellaneous +import builtins +import datetime +import json +import sys +import time +import itertools +import traceback + +# onnx +# The onnx import causes deprecation warnings every time workers +# are spawned during testing. So, we filter out those warnings. 
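# [Editor's note: illustrative sketch, not part of the patch.]
# The three layer types defined in the header comment above, written out with
# NumPy on made-up shapes: (1) a fully connected layer y = Wx + b, (2) an
# embedding lookup over sparse indices, and (3) the Sum / Dot / Cat interaction
# operators. The header writes the lookup as selecting columns of E; the PyTorch
# tables in this file store one embedding vector per row, so rows are indexed here.
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(3, 4)).astype(np.float32)
b = np.zeros(3, dtype=np.float32)
x = rng.normal(size=4).astype(np.float32)
y = W @ x + b                                        # (1) fully connected layer

E = rng.normal(size=(10, 3)).astype(np.float32)      # one embedding table
p = [1, 4, 7]                                        # sparse indices for one lookup
e = [E[pk] for pk in p]                              # (2) embedding lookup

z_sum = np.sum(e, axis=0)                            # (3) Sum(e1, ..., ek)
z_dot = np.array([ei @ ej for ei in e for ej in e])  #     Dot: all pairwise dot products
z_cat = np.concatenate(e)                            #     Cat: concatenation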
+import warnings + +# data generation +import dlrm_data_pytorch as dp + +# For distributed run +import extend_distributed as ext_dist +import mlperf_logger + +# numpy +import numpy as np +import optim.rwsadagrad as RowWiseSparseAdagrad +import sklearn.metrics + +# pytorch +import torch +import torch.nn as nn +from torch._ops import ops +from torch.autograd.profiler import record_function +from torch.nn.parallel.parallel_apply import parallel_apply +from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.scatter_gather import gather, scatter +from torch.nn.parameter import Parameter +from torch.optim.lr_scheduler import _LRScheduler +from torch.utils.tensorboard import SummaryWriter + +try: + import fbgemm_gpu + from fbgemm_gpu import split_table_batched_embeddings_ops + from fbgemm_gpu.split_table_batched_embeddings_ops import ( + CacheAlgorithm, + PoolingMode, + OptimType, + SparseType, + SplitTableBatchedEmbeddingBagsCodegen, + IntNBitTableBatchedEmbeddingBagsCodegen, + ) +except (ImportError, OSError): + fbgemm_gpu_import_error_msg = traceback.format_exc() + fbgemm_gpu = None + +try: + import apex +except (ImportError, OSError): + apex_import_error_msg = traceback.format_exc() + apex = None + +try: + import torch2trt + from torch2trt import torch2trt +except (ImportError, OSError): + torch2trt_import_error_msg = traceback.format_exc() + torch2trt = None + +# mixed-dimension trick +from tricks.md_embedding_bag import PrEmbeddingBag, md_solver + +# FB5 Logger +import pathlib +from os import fspath +p = pathlib.Path(__file__).parent.resolve() / "../../../fb5logging" +sys.path.append(fspath(p)) +from fb5logger import FB5Logger +import loggerconstants + +# quotient-remainder trick +from tricks.qr_embedding_bag import QREmbeddingBag + +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + try: + import onnx + except ImportError as error: + print("Unable to import onnx. ", error) + +# from torchviz import make_dot +# import torch.nn.functional as Functional +# from torch.nn.parameter import Parameter + +exc = getattr(builtins, "IOError", "FileNotFoundError") + + +def time_wrap(use_gpu): + if use_gpu: + torch.cuda.synchronize() + return time.time() + + +def dlrm_wrap(X, lS_o, lS_i, use_gpu, device, ndevices=1): + with record_function("DLRM forward"): + if use_gpu: # .cuda() + # lS_i can be either a list of tensors or a stacked tensor. + # Handle each case below: + if ndevices == 1: + lS_i = ( + [S_i.to(device) for S_i in lS_i] + if isinstance(lS_i, list) + else lS_i.to(device) + ) + lS_o = ( + [S_o.to(device) for S_o in lS_o] + if isinstance(lS_o, list) + else lS_o.to(device) + ) + return dlrm(X.to(device), lS_o, lS_i) + + +def loss_fn_wrap(Z, T, use_gpu, device): + with record_function("DLRM loss compute"): + if args.loss_function == "mse" or args.loss_function == "bce": + return dlrm.loss_fn(Z, T.to(device)) + elif args.loss_function == "wbce": + loss_ws_ = dlrm.loss_ws[T.data.view(-1).long()].view_as(T).to(device) + loss_fn_ = dlrm.loss_fn(Z, T.to(device)) + loss_sc_ = loss_ws_ * loss_fn_ + return loss_sc_.mean() + + +# The following function is a wrapper to avoid checking this multiple times in th +# loop below. 
+def unpack_batch(b): + # Experiment with unweighted samples + return b[0], b[1], b[2], b[3], torch.ones(b[3].size()), None + + +class LRPolicyScheduler(_LRScheduler): + def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps): + self.num_warmup_steps = num_warmup_steps + self.decay_start_step = decay_start_step + self.decay_end_step = decay_start_step + num_decay_steps + self.num_decay_steps = num_decay_steps + + if self.decay_start_step < self.num_warmup_steps: + sys.exit("Learning rate warmup must finish before the decay starts") + + super(LRPolicyScheduler, self).__init__(optimizer) + + def get_lr(self): + step_count = self._step_count + if step_count < self.num_warmup_steps: + # warmup + scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps + lr = [base_lr * scale for base_lr in self.base_lrs] + self.last_lr = lr + elif self.decay_start_step <= step_count and step_count < self.decay_end_step: + # decay + decayed_steps = step_count - self.decay_start_step + scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2 + min_lr = 0.0000001 + lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs] + self.last_lr = lr + else: + if self.num_decay_steps > 0: + # freeze at last, either because we're after decay + # or because we're between warmup and decay + lr = self.last_lr + else: + # do not adjust + lr = self.base_lrs + return lr + + +# quantize_fbgemm_gpu_embedding_bag is partially lifted from +# fbgemm_gpu/test/split_embedding_inference_converter.py, def _quantize_split_embs. +# Converts SplitTableBatchedEmbeddingBagsCodegen to IntNBitTableBatchedEmbeddingBagsCodegen +def quantize_fbgemm_gpu_embedding_bag(model, quantize_type, device): + embedding_specs = [] + if device.type == "cpu": + emb_location = split_table_batched_embeddings_ops.EmbeddingLocation.HOST + else: + emb_location = split_table_batched_embeddings_ops.EmbeddingLocation.DEVICE + + for (E, D, _, _) in model.embedding_specs: + weights_ty = quantize_type + if D % weights_ty.align_size() != 0: + assert D % 4 == 0 + weights_ty = ( + SparseType.FP16 + ) # fall back to FP16 if dimension couldn't be aligned with the required size + embedding_specs.append(("", E, D, weights_ty, emb_location)) + + q_model = ( + split_table_batched_embeddings_ops.IntNBitTableBatchedEmbeddingBagsCodegen( + embedding_specs=embedding_specs, + pooling_mode=model.pooling_mode, + device=device, + ) + ) + q_model.initialize_weights() + for t, (_, _, _, weight_ty, _) in enumerate(embedding_specs): + if weight_ty == SparseType.FP16: + original_weight = model.split_embedding_weights()[t] + q_weight = original_weight.half() + weights = torch.tensor(q_weight.cpu().numpy().view(np.uint8)) + q_model.split_embedding_weights()[t][0].data.copy_(weights) + + elif weight_ty == SparseType.INT8: + original_weight = model.split_embedding_weights()[t] + q_weight = torch.ops.fbgemm.FloatToFused8BitRowwiseQuantized( + original_weight + ) + weights = q_weight[:, :-8] + scale_shift = torch.tensor( + q_weight[:, -8:] + .contiguous() + .cpu() + .numpy() + .view(np.float32) + .astype(np.float16) + .view(np.uint8) + ) + q_model.split_embedding_weights()[t][0].data.copy_(weights) + q_model.split_embedding_weights()[t][1].data.copy_(scale_shift) + + elif weight_ty == SparseType.INT4 or weight_ty == SparseType.INT2: + original_weight = model.split_embedding_weights()[t] + q_weight = torch.ops.fbgemm.FloatToFusedNBitRowwiseQuantizedSBHalf( + original_weight, + bit_rate=quantize_type.bit_rate(), + ) + weights = 
q_weight[:, :-4] + scale_shift = torch.tensor( + q_weight[:, -4:].contiguous().cpu().numpy().view(np.uint8) + ) + q_model.split_embedding_weights()[t][0].data.copy_(weights) + q_model.split_embedding_weights()[t][1].data.copy_(scale_shift) + return q_model + + +def create_fbgemm_gpu_emb_bag( + device, + emb_l, + m_spa, + quantize_bits, + learning_rate, + codegen_preference=None, + requires_grad=True, +): + if isinstance(emb_l[0], PrEmbeddingBag): + emb_l = [e.embs for e in emb_l] + if isinstance(emb_l[0], nn.EmbeddingBag): + emb_l = [e.weight for e in emb_l] + Es = [e.shape[0] for e in emb_l] + + if isinstance(m_spa, list): + Ds = m_spa + else: + Ds = [m_spa for _ in emb_l] + + if device.type == "cpu": + emb_location = split_table_batched_embeddings_ops.EmbeddingLocation.HOST + compute_device = split_table_batched_embeddings_ops.ComputeDevice.CPU + else: + emb_location = split_table_batched_embeddings_ops.EmbeddingLocation.DEVICE + compute_device = split_table_batched_embeddings_ops.ComputeDevice.CUDA + pooling_mode = PoolingMode.SUM + cache_algorithm = CacheAlgorithm.LRU + + sparse_type_dict = { + 4: SparseType.INT4, + 8: SparseType.INT8, + 16: SparseType.FP16, + 32: SparseType.FP32, + } + codegen_type_dict = { + 4: "IntN", + 8: "Split" if codegen_preference != "IntN" else "IntN", + 16: "Split" if codegen_preference != "IntN" else "IntN", + 32: "Split", + } + + codegen_type = codegen_type_dict[quantize_bits] + quantize_type = sparse_type_dict[quantize_bits] + if codegen_type == "IntN": + # Create non-quantized model and then call quantize_fbgemm_gpu_embedding_bag + fbgemm_gpu_emb_bag = SplitTableBatchedEmbeddingBagsCodegen( + embedding_specs=[ + ( + E, # num of rows in the table + D, # num of columns in the table + split_table_batched_embeddings_ops.EmbeddingLocation.HOST, + split_table_batched_embeddings_ops.ComputeDevice.CPU, + ) + for (E, D) in zip(Es, Ds) + ], + weights_precision=SparseType.FP32, + optimizer=OptimType.EXACT_SGD, + learning_rate=learning_rate, + cache_algorithm=cache_algorithm, + pooling_mode=pooling_mode, + ).to(device) + if quantize_type == quantize_type.FP16: + weights = fbgemm_gpu_emb_bag.split_embedding_weights() + for i, emb in enumerate(weights): + emb.data.copy_(emb_l[i]) + + elif quantize_type == quantize_type.INT8: + # copy quantized values upsampled/recasted to FP32 + for i in range(len(Es)): + fbgemm_gpu_emb_bag.split_embedding_weights()[i].data.copy_( + torch.ops.fbgemm.Fused8BitRowwiseQuantizedToFloat(emb_l[i]) + ) + elif quantize_type == quantize_type.INT4: + # copy quantized values upsampled/recasted to FP32 + for i in range(len(Es)): + fbgemm_gpu_emb_bag.split_embedding_weights()[i].data.copy_( + torch.ops.fbgemm.FusedNBitRowwiseQuantizedSBHalfToFloat( + emb_l[i], + bit_rate=quantize_type.bit_rate(), + ) + ) + fbgemm_gpu_emb_bag = quantize_fbgemm_gpu_embedding_bag( + fbgemm_gpu_emb_bag, quantize_type, device + ) + else: + fbgemm_gpu_emb_bag = SplitTableBatchedEmbeddingBagsCodegen( + embedding_specs=[ + ( + E, # num of rows in the table + D, # num of columns in the table + emb_location, + compute_device, + ) + for (E, D) in zip(Es, Ds) + ], + weights_precision=quantize_type, + optimizer=OptimType.EXACT_SGD, + learning_rate=learning_rate, + cache_algorithm=cache_algorithm, + pooling_mode=pooling_mode, + ).to(device) + + weights = fbgemm_gpu_emb_bag.split_embedding_weights() + for i, emb in enumerate(weights): + emb.data.copy_(emb_l[i]) + + if not requires_grad: + torch.no_grad() + torch.set_grad_enabled(False) + + return fbgemm_gpu_emb_bag + + +# The 
purpose of this wrapper is to encapsulate the format conversions to/from fbgemm_gpu +# so parallel_apply() executes the format-in -> fbgemm_gpu op -> format-out instructions +# for each respective GPU in parallel. +class fbgemm_gpu_emb_bag_wrapper(nn.Module): + def __init__( + self, + device, + emb_l, + m_spa, + quantize_bits, + learning_rate, + codegen_preference, + requires_grad, + ): + super(fbgemm_gpu_emb_bag_wrapper, self).__init__() + self.fbgemm_gpu_emb_bag = create_fbgemm_gpu_emb_bag( + device, + emb_l, + m_spa, + quantize_bits, + learning_rate, + codegen_preference, + requires_grad, + ) + self.device = device + self.m_spa = m_spa + # create cumsum array for mixed dimension support + if isinstance(m_spa, list): + self.m_spa_cumsum = np.cumsum([0] + m_spa) + if not requires_grad: + torch.no_grad() + torch.set_grad_enabled(False) + + def forward(self, lS_o, lS_i, v_W_l=None): + + # convert offsets to fbgemm format + lengths_list = list(map(len, lS_i)) + indices_lengths_cumsum = np.cumsum([0] + lengths_list) + if isinstance(lS_o, list): + lS_o = torch.stack(lS_o) + lS_o = lS_o.to(self.device) + lS_o += torch.from_numpy(indices_lengths_cumsum[:-1, np.newaxis]).to( + self.device + ) + numel = torch.tensor([indices_lengths_cumsum[-1]], dtype=torch.long).to( + self.device + ) + lS_o = torch.cat((lS_o.flatten(), numel)) + + # create per_sample_weights + if v_W_l: + per_sample_weights = torch.cat( + [a.gather(0, b) for a, b in zip(v_W_l, lS_i)] + ) + else: + per_sample_weights = None + + # convert indices to fbgemm_gpu format + if isinstance(lS_i, torch.Tensor): + lS_i = [lS_i] + lS_i = torch.cat(lS_i, dim=0).to(self.device) + + if isinstance(self.fbgemm_gpu_emb_bag, IntNBitTableBatchedEmbeddingBagsCodegen): + lS_o = lS_o.int() + lS_i = lS_i.int() + + # gpu embedding bag op + ly = self.fbgemm_gpu_emb_bag(lS_i, lS_o, per_sample_weights) + + # convert the results to the next layer's input format. + if isinstance(self.m_spa, list): + # handle mixed dimensions case. + ly = [ + ly[:, s:e] + for (s, e) in zip(self.m_spa_cumsum[:-1], self.m_spa_cumsum[1:]) + ] + else: + # handle case in which all tables share the same column dimension. 
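# [Editor's note: illustrative sketch, not part of the patch.]
# The offset conversion done in forward() above, on toy data: per-table offsets
# are shifted by the running count of indices belonging to earlier tables,
# flattened, and the grand total is appended, so a single fused kernel can walk
# all tables' lookups back to back. NumPy stand-in for the tensor code above.
import numpy as np

lS_i = [np.array([3, 1, 4]), np.array([5, 9])]   # indices for two tables
lS_o = [np.array([0, 2]), np.array([0, 1])]      # per-table offsets (batch size 2)

lengths = [len(i) for i in lS_i]                 # [3, 2]
cum = np.cumsum([0] + lengths)                   # [0, 3, 5]
fused_offsets = np.concatenate([o + c for o, c in zip(lS_o, cum[:-1])] + [cum[-1:]])
fused_indices = np.concatenate(lS_i)
# fused_offsets -> [0, 2, 3, 4, 5]; fused_indices -> [3, 1, 4, 5, 9]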
+ cols = self.m_spa + ntables = len(self.fbgemm_gpu_emb_bag.embedding_specs) + ly = ly.reshape(-1, ntables, cols).swapaxes(0, 1) + ly = list(ly) + return ly + + +### define dlrm in PyTorch ### +class DLRM_Net(nn.Module): + def create_mlp(self, ln, sigmoid_layer): + # build MLP layer by layer + layers = nn.ModuleList() + layers.training = self.requires_grad + for i in range(0, ln.size - 1): + n = ln[i] + m = ln[i + 1] + + # construct fully connected operator + LL = nn.Linear(int(n), int(m), bias=True) + + # initialize the weights + # with torch.no_grad(): + # custom Xavier input, output or two-sided fill + mean = 0.0 # std_dev = np.sqrt(variance) + std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n) + W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32) + std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1)) + bt = np.random.normal(mean, std_dev, size=m).astype(np.float32) + # approach 1 + LL.weight.data = torch.tensor(W) + LL.weight.requires_grad = self.requires_grad + LL.bias.data = torch.tensor(bt) + LL.bias.requires_grad = self.requires_grad + # approach 2 + # LL.weight.data.copy_(torch.tensor(W)) + # LL.bias.data.copy_(torch.tensor(bt)) + # approach 3 + # LL.weight = Parameter(torch.tensor(W),requires_grad=True) + # LL.bias = Parameter(torch.tensor(bt),requires_grad=True) + layers.append(LL) + + # construct sigmoid or relu operator + if i == sigmoid_layer: + layers.append(nn.Sigmoid()) + else: + layers.append(nn.ReLU()) + + # approach 1: use ModuleList + # return layers + # approach 2: use Sequential container to wrap all layers + return torch.nn.Sequential(*layers) + + def create_emb(self, m, ln, weighted_pooling=None): + # create_emb parameter description + # + # ln parameter: + # ln is a list of all the tables' row counts. E.g. [10,5,16] would mean + # table 0 has 10 rows, table 1 has 5 rows, and table 2 has 16 rows. + # + # m parameter (when m is a single value): + # m is the length of all embedding vectors. All embedding vectors in all + # embedding tables are created to be the same length. E.g. if ln were [3,2,5] + # and m were 4, table 0 would be dimension 3 x 4, table 1 would be 2 x 4, + # and table 2 would be 5 x 4. + # + # m parameter (when m is a list): + # m is a list of all the tables' column counts. E.g. if m were [4,5,6] and + # ln were [3,2,5], table 0 would be dimension 3 x 4, table 1 would be 2 x 5, + # and table 2 would be 5 x 6. + # + # Key to remember: + # embedding table i has shape: ln[i] rows, m columns, when m is a single value. + # embedding table i has shape: ln[i] rows, m[i] columns, when m is a list. + + emb_l = nn.ModuleList() + v_W_l = [] + for i in range(0, ln.size): + if ext_dist.my_size > 1: + if i not in self.local_emb_indices: + continue + n = ln[i] + + # construct embedding operator + if self.qr_flag and n > self.qr_threshold: + EE = QREmbeddingBag( + n, + m, + self.qr_collisions, + operation=self.qr_operation, + mode="sum", + sparse=True, + ) + elif self.md_flag and n > self.md_threshold: + base = max(m) + _m = m[i] if n > self.md_threshold else base + EE = PrEmbeddingBag(n, _m, base) + # use np initialization as below for consistency... 
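# [Editor's note: illustrative sketch, not part of the patch.]
# The idea behind the QREmbeddingBag branch above (quotient-remainder trick):
# a large index space of size n is covered by two much smaller tables, one
# addressed by the quotient and one by the remainder of the index with respect
# to the collision count, and the two looked-up vectors are combined (element-wise
# product for the default "mult" operation). Toy NumPy restatement only; the real
# implementation lives in tricks/qr_embedding_bag.py.
import numpy as np

n, m, collisions = 1000, 4, 10
rng = np.random.default_rng(0)
E_q = rng.normal(size=((n + collisions - 1) // collisions, m)).astype(np.float32)
E_r = rng.normal(size=(collisions, m)).astype(np.float32)

def qr_lookup(idx, operation="mult"):
    q, r = idx // collisions, idx % collisions
    return E_q[q] * E_r[r] if operation == "mult" else E_q[q] + E_r[r]

v = qr_lookup(537)   # uses row 53 of E_q and row 7 of E_r instead of row 537 of a 1000-row table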
+ W = np.random.uniform( + low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, _m) + ).astype(np.float32) + EE.embs.weight.data = torch.tensor(W, requires_grad=self.requires_grad) + else: + EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True) + # initialize embeddings + # nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n)) + W = np.random.uniform( + low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m) + ).astype(np.float32) + # approach 1 + EE.weight.data = torch.tensor(W, requires_grad=self.requires_grad) + # approach 2 + # EE.weight.data.copy_(torch.tensor(W)) + # approach 3 + # EE.weight = Parameter(torch.tensor(W),requires_grad=True) + if weighted_pooling is None: + v_W_l.append(None) + else: + v_W_l.append(torch.ones(n, dtype=torch.float32)) + emb_l.append(EE) + return emb_l, v_W_l + + def __init__( + self, + m_spa=None, + ln_emb=None, + ln_bot=None, + ln_top=None, + arch_interaction_op=None, + arch_interaction_itself=False, + sigmoid_bot=-1, + sigmoid_top=-1, + sync_dense_params=True, + loss_threshold=0.0, + ndevices=-1, + qr_flag=False, + qr_operation="mult", + qr_collisions=0, + qr_threshold=200, + md_flag=False, + md_threshold=200, + weighted_pooling=None, + loss_function="bce", + learning_rate=0.1, + use_gpu=False, + use_fbgemm_gpu=False, + fbgemm_gpu_codegen_pref="Split", + inference_only=False, + quantize_mlp_with_bit=False, + quantize_emb_with_bit=False, + ): + super(DLRM_Net, self).__init__() + + if ( + (m_spa is not None) + and (ln_emb is not None) + and (ln_bot is not None) + and (ln_top is not None) + and (arch_interaction_op is not None) + ): + # save arguments + self.ntables = len(ln_emb) + self.m_spa = m_spa + self.use_gpu = use_gpu + self.use_fbgemm_gpu = use_fbgemm_gpu + self.fbgemm_gpu_codegen_pref = fbgemm_gpu_codegen_pref + self.requires_grad = not inference_only + self.ndevices_available = ndevices + self.ndevices_in_use = ndevices + self.output_d = 0 + self.add_new_weights_to_params = False + self.arch_interaction_op = arch_interaction_op + self.arch_interaction_itself = arch_interaction_itself + self.sync_dense_params = sync_dense_params and not inference_only + self.loss_threshold = loss_threshold + self.loss_function = loss_function + self.learning_rate = learning_rate + if weighted_pooling is not None and weighted_pooling != "fixed": + self.weighted_pooling = "learned" + else: + self.weighted_pooling = weighted_pooling + # create variables for QR embedding if applicable + self.qr_flag = qr_flag + if self.qr_flag: + self.qr_collisions = qr_collisions + self.qr_operation = qr_operation + self.qr_threshold = qr_threshold + # create variables for MD embedding if applicable + self.md_flag = md_flag + if self.md_flag: + self.md_threshold = md_threshold + + # If running distributed, get local slice of embedding tables + if ext_dist.my_size > 1: + n_emb = len(ln_emb) + if n_emb < ext_dist.my_size: + sys.exit( + "only (%d) sparse features for (%d) devices, table partitions will fail" + % (n_emb, ext_dist.my_size) + ) + self.n_global_emb = n_emb + self.n_local_emb, self.n_emb_per_rank = ext_dist.get_split_lengths( + n_emb + ) + self.local_emb_slice = ext_dist.get_my_slice(n_emb) + self.local_emb_indices = list(range(n_emb))[self.local_emb_slice] + + # create operators + self.emb_l, self.v_W_l = self.create_emb(m_spa, ln_emb, weighted_pooling) + if self.weighted_pooling == "learned": + self.v_W_l = nn.ParameterList(list(map(Parameter, self.v_W_l))) + + self.bot_l = self.create_mlp(ln_bot, sigmoid_bot) + self.top_l = self.create_mlp(ln_top, sigmoid_top) + + # 
quantization + self.quantize_emb = False + self.emb_l_q = [] + self.quantize_bits = 32 + + # fbgemm_gpu + self.fbgemm_emb_l = [] + self.v_W_l_l = [self.v_W_l] if self.weighted_pooling else [None] + + self.interact_features_l = [] + + # specify the loss function + if self.loss_function == "mse": + self.loss_fn = torch.nn.MSELoss(reduction="mean") + elif self.loss_function == "bce": + self.loss_fn = torch.nn.BCELoss(reduction="mean") + elif self.loss_function == "wbce": + self.loss_ws = torch.tensor( + np.fromstring(args.loss_weights, dtype=float, sep="-") + ) + self.loss_fn = torch.nn.BCELoss(reduction="none") + else: + sys.exit( + "ERROR: --loss-function=" + self.loss_function + " is not supported" + ) + + def prepare_parallel_model(self, ndevices): + device_ids = range(ndevices) + # replicate mlp (data parallelism) + self.bot_l_replicas = replicate(self.bot_l, device_ids) + self.top_l_replicas = replicate(self.top_l, device_ids) + + # distribute embeddings (model parallelism) + if self.weighted_pooling is not None: + for k, w in enumerate(self.v_W_l): + self.v_W_l[k] = Parameter( + w.to(torch.device("cuda:" + str(k % ndevices))) + ) + if not self.use_fbgemm_gpu: + for k, w in enumerate(self.emb_l): + self.emb_l[k] = w.to(torch.device("cuda:" + str(k % ndevices))) + else: + self.fbgemm_emb_l, self.v_W_l_l = zip( + *[ + ( + fbgemm_gpu_emb_bag_wrapper( + torch.device("cuda:" + str(k)), + self.emb_l[k::ndevices] + if self.emb_l + else self.emb_l_q[k::ndevices], + self.m_spa[k::ndevices] + if isinstance(self.m_spa, list) + else self.m_spa, + self.quantize_bits, + self.learning_rate, + self.fbgemm_gpu_codegen_pref, + self.requires_grad, + ), + self.v_W_l[k::ndevices] if self.weighted_pooling else None, + ) + for k in range(ndevices) + ] + ) + self.add_new_weights_to_params = True + self.interact_features_l = [self.nn_module_wrapper() for _ in range(ndevices)] + + # nn_module_wrapper is used to call functions concurrently across multi-gpus, using parallel_apply, + # which requires an nn.Module subclass. + class nn_module_wrapper(nn.Module): + def __init__(self): + super(DLRM_Net.nn_module_wrapper, self).__init__() + def forward(self, E, x, ly): + return E(x, ly) + + def apply_mlp(self, x, layers): + # approach 1: use ModuleList + # for layer in layers: + # x = layer(x) + # return x + # approach 2: use Sequential container to wrap all layers + return layers(x) + + def apply_emb(self, lS_o, lS_i): + # WARNING: notice that we are processing the batch at once. We implicitly + # assume that the data is laid out such that: + # 1. each embedding is indexed with a group of sparse indices, + # corresponding to a single lookup + # 2. for each embedding the lookups are further organized into a batch + # 3. for a list of embedding tables there is a list of batched lookups + + if self.use_fbgemm_gpu: + # Deinterleave and reshape to 2d, so items are grouped by device + # per row. Then parallel apply. + ndevices = len(self.fbgemm_emb_l) + lS_o_l = [lS_o[k::ndevices] for k in range(ndevices)] + lS_i_l = [lS_i[k::ndevices] for k in range(ndevices)] + ly = parallel_apply( + self.fbgemm_emb_l, list(zip(lS_o_l, lS_i_l, self.v_W_l_l)) + ) + # Interleave and flatten to match non-fbgemm_gpu ly format. + ly = [ly[i % ndevices][i // ndevices] for i in range(self.ntables)] + else: + ly = [] + for k, sparse_index_group_batch in enumerate(lS_i): + sparse_offset_group_batch = lS_o[k] + + # embedding lookup + # We are using EmbeddingBag, which implicitly uses sum operator. 
+ # The embeddings are represented as tall matrices, with sum + # happening vertically across 0 axis, resulting in a row vector + # E = emb_l[k] + + if self.v_W_l[k] is not None: + per_sample_weights = self.v_W_l[k].gather( + 0, sparse_index_group_batch + ) + else: + per_sample_weights = None + + if self.quantize_emb: + if self.quantize_bits == 4: + E = ops.quantized.embedding_bag_4bit_rowwise_offsets + elif self.quantize_bits == 8: + E = ops.quantized.embedding_bag_byte_rowwise_offsets + QV = E( + self.emb_l_q[k], + sparse_index_group_batch, + sparse_offset_group_batch, + per_sample_weights=per_sample_weights, + ) + + ly.append(QV) + else: + E = self.emb_l[k] + V = E( + sparse_index_group_batch, + sparse_offset_group_batch, + per_sample_weights=per_sample_weights, + ) + + ly.append(V) + + # print(ly) + return ly + + # using quantizing functions from caffe2/aten/src/ATen/native/quantized/cpu + def quantize_embedding(self, bits): + + n = len(self.emb_l) + self.emb_l_q = [None] * n + for k in range(n): + if bits == 4: + self.emb_l_q[k] = ops.quantized.embedding_bag_4bit_prepack( + self.emb_l[k].weight + ) + elif bits == 8: + self.emb_l_q[k] = ops.quantized.embedding_bag_byte_prepack( + self.emb_l[k].weight + ) + elif bits == 16: + self.emb_l_q[k] = self.emb_l[k].half().weight + else: + return + self.emb_l = None + self.quantize_emb = True + self.quantize_bits = bits + + def interact_features(self, x, ly): + + if self.arch_interaction_op == "dot": + # concatenate dense and sparse features + (batch_size, d) = x.shape + T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d)) + # perform a dot product + Z = torch.bmm(T, torch.transpose(T, 1, 2)) + # append dense feature with the interactions (into a row vector) + # approach 1: all + # Zflat = Z.view((batch_size, -1)) + # approach 2: unique + _, ni, nj = Z.shape + # approach 1: tril_indices + # offset = 0 if self.arch_interaction_itself else -1 + # li, lj = torch.tril_indices(ni, nj, offset=offset) + # approach 2: custom + offset = 1 if self.arch_interaction_itself else 0 + li = torch.tensor([i for i in range(ni) for j in range(i + offset)]) + lj = torch.tensor([j for i in range(nj) for j in range(i + offset)]) + Zflat = Z[:, li, lj] + # concatenate dense features and interactions + R = torch.cat([x] + [Zflat], dim=1) + elif self.arch_interaction_op == "cat": + # concatenation features (into a row vector) + R = torch.cat([x] + ly, dim=1) + else: + sys.exit( + "ERROR: --arch-interaction-op=" + + self.arch_interaction_op + + " is not supported" + ) + + return R + + def forward(self, dense_x, lS_o, lS_i): + if ext_dist.my_size > 1: + # multi-node multi-device run + return self.distributed_forward(dense_x, lS_o, lS_i) + elif self.ndevices_available <= 1: + # single device run + return self.sequential_forward(dense_x, lS_o, lS_i) + else: + # single-node multi-device run + return self.parallel_forward(dense_x, lS_o, lS_i) + + def distributed_forward(self, dense_x, lS_o, lS_i): + batch_size = dense_x.size()[0] + # WARNING: # of ranks must be <= batch size in distributed_forward call + if batch_size < ext_dist.my_size: + sys.exit( + "ERROR: batch_size (%d) must be larger than number of ranks (%d)" + % (batch_size, ext_dist.my_size) + ) + if batch_size % ext_dist.my_size != 0: + sys.exit( + "ERROR: batch_size %d can not split across %d ranks evenly" + % (batch_size, ext_dist.my_size) + ) + + dense_x = dense_x[ext_dist.get_my_slice(batch_size)] + lS_o = lS_o[self.local_emb_slice] + lS_i = lS_i[self.local_emb_slice] + + if (self.ntables != len(lS_o)) or 
(self.ntables != len(lS_i)): + sys.exit( + "ERROR: corrupted model input detected in distributed_forward call" + ) + + # embeddings + with record_function("DLRM embedding forward"): + ly = self.apply_emb(lS_o, lS_i) + + # WARNING: Note that at this point we have the result of the embedding lookup + # for the entire batch on each rank. We would like to obtain partial results + # corresponding to all embedding lookups, but part of the batch on each rank. + # Therefore, matching the distribution of output of bottom mlp, so that both + # could be used for subsequent interactions on each device. + if self.ntables != len(ly): + sys.exit("ERROR: corrupted intermediate result in distributed_forward call") + + a2a_req = ext_dist.alltoall(ly, self.n_emb_per_rank) + + with record_function("DLRM bottom nlp forward"): + x = self.apply_mlp(dense_x, self.bot_l) + + ly = a2a_req.wait() + ly = list(ly) + + # interactions + with record_function("DLRM interaction forward"): + z = self.interact_features(x, ly) + + # top mlp + with record_function("DLRM top nlp forward"): + p = self.apply_mlp(z, self.top_l) + + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold)) + else: + z = p + + return z + + def sequential_forward(self, dense_x, lS_o, lS_i): + # process dense features (using bottom mlp), resulting in a row vector + x = self.apply_mlp(dense_x, self.bot_l) + # debug prints + # print("intermediate") + # print(x.detach().cpu().numpy()) + + # process sparse features(using embeddings), resulting in a list of row vectors + ly = self.apply_emb(lS_o, lS_i) + # for y in ly: + # print(y.detach().cpu().numpy()) + + # interact features (dense and sparse) + z = self.interact_features(x, ly) + # print(z.detach().cpu().numpy()) + + # obtain probability of a click (using top mlp) + p = self.apply_mlp(z, self.top_l) + + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold)) + else: + z = p + + return z + + def parallel_forward(self, dense_x, lS_o, lS_i): + ### prepare model (overwrite) ### + # WARNING: # of devices must be >= batch size in parallel_forward call + batch_size = dense_x.size()[0] + ndevices = min(self.ndevices_available, batch_size, self.ntables) + device_ids = range(ndevices) + # WARNING: must redistribute the model if mini-batch size changes(this is common + # for last mini-batch, when # of elements in the dataset/batch size is not even + if self.ndevices_in_use != ndevices: + self.ndevices_in_use = ndevices + self.prepare_parallel_model(ndevices) + elif self.sync_dense_params: + # When training, replicate the new/updated mlp weights each iteration. + # For inference-only, this code should never run. 
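# [Editor's note: illustrative sketch, not part of the patch.]
# The "dot" branch of interact_features above, reduced to a toy example. With
# num_fea feature vectors of dimension d per sample (the bottom-mlp output plus
# the embedding outputs), bmm produces all pairwise dot products and only the
# strictly lower triangular entries are kept, which is why run() later sizes the
# top-mlp input as num_fea * (num_fea - 1) // 2 + m_den_out. Shapes are made up.
import torch

batch_size, d, num_fea = 2, 4, 3
x = torch.randn(batch_size, d)                                   # bottom-mlp output
ly = [torch.randn(batch_size, d) for _ in range(num_fea - 1)]    # embedding outputs

T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))         # (batch, num_fea, d)
Z = torch.bmm(T, torch.transpose(T, 1, 2))                       # pairwise dot products
li = torch.tensor([i for i in range(num_fea) for j in range(i)])
lj = torch.tensor([j for i in range(num_fea) for j in range(i)])
Zflat = Z[:, li, lj]                      # num_fea * (num_fea - 1) // 2 = 3 interaction terms
R = torch.cat([x, Zflat], dim=1)          # width d + 3 = 7, the top-mlp input per sample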
+ self.bot_l_replicas = replicate(self.bot_l, device_ids) + self.top_l_replicas = replicate(self.top_l, device_ids) + + ### prepare input (overwrite) ### + # scatter dense features (data parallelism) + # print(dense_x.device) + dense_x = scatter(dense_x, device_ids, dim=0) + # distribute sparse features (model parallelism) + if (self.ntables != len(lS_o)) or (self.ntables != len(lS_i)): + sys.exit("ERROR: corrupted model input detected in parallel_forward call") + + lS_o = [ + lS_o[k].to(torch.device("cuda:" + str(k % ndevices))) + for k in range(self.ntables) + ] + lS_i = [ + lS_i[k].to(torch.device("cuda:" + str(k % ndevices))) + for k in range(self.ntables) + ] + + ### compute results in parallel ### + # bottom mlp + # WARNING: Note that the self.bot_l is a list of bottom mlp modules + # that have been replicated across devices, while dense_x is a tuple of dense + # inputs that has been scattered across devices on the first (batch) dimension. + # The output is a list of tensors scattered across devices according to the + # distribution of dense_x. + x = parallel_apply(self.bot_l_replicas, dense_x, None, device_ids) + # debug prints + # print(x) + + # embeddings + ly = self.apply_emb(lS_o, lS_i) + # debug prints + # print(ly) + + # butterfly shuffle (implemented inefficiently for now) + # WARNING: Note that at this point we have the result of the embedding lookup + # for the entire batch on each device. We would like to obtain partial results + # corresponding to all embedding lookups, but part of the batch on each device. + # Therefore, matching the distribution of output of bottom mlp, so that both + # could be used for subsequent interactions on each device. + if self.ntables != len(ly): + sys.exit("ERROR: corrupted intermediate result in parallel_forward call") + + t_list = [scatter(ly[k], device_ids, dim=0) for k in range(self.ntables)] + + # adjust the list to be ordered per device + ly = list(map(lambda y: list(y), zip(*t_list))) + # debug prints + # print(ly) + + # interactions + z = parallel_apply(self.interact_features_l, list(zip(itertools.repeat(self.interact_features),x,ly))) + # debug prints + # print(z) + + # top mlp + # WARNING: Note that the self.top_l is a list of top mlp modules that + # have been replicated across devices, while z is a list of interaction results + # that by construction are scattered across devices on the first (batch) dim. + # The output is a list of tensors scattered across devices according to the + # distribution of z. 
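# [Editor's note: illustrative sketch, not part of the patch.]
# The "butterfly shuffle" performed just above, restated with plain lists instead
# of scattered CUDA tensors: before the shuffle, each device holds the full-batch
# output of its own tables; after scattering every table's output by batch chunk
# and regrouping, each device holds its batch slice for all tables, which is what
# the per-device interaction step needs.
ndevices, ntables = 2, 3
ly = [[f"tbl{k}_chunk{c}" for c in range(ndevices)] for k in range(ntables)]
t_list = ly                                        # stand-in for scatter(ly[k], device_ids, dim=0)
per_device = [list(chunks) for chunks in zip(*t_list)]
# per_device[0] == ['tbl0_chunk0', 'tbl1_chunk0', 'tbl2_chunk0']
# per_device[1] == ['tbl0_chunk1', 'tbl1_chunk1', 'tbl2_chunk1']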
+ p = parallel_apply(self.top_l_replicas, z, None, device_ids) + + ### gather the distributed results ### + p0 = gather(p, self.output_d, dim=0) + + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z0 = torch.clamp( + p0, min=self.loss_threshold, max=(1.0 - self.loss_threshold) + ) + else: + z0 = p0 + + return z0 + + def print_weights(self): + if self.use_fbgemm_gpu and len(self.fbgemm_emb_l): + ntables_l = [ + len(e.fbgemm_gpu_emb_bag.embedding_specs) for e in self.fbgemm_emb_l + ] + for j in range(ntables_l[0] + 1): + for k, e in enumerate(self.fbgemm_emb_l): + if j < ntables_l[k]: + print( + e.fbgemm_gpu_emb_bag.split_embedding_weights()[j] + .detach() + .cpu() + .numpy() + ) + elif self.quantize_bits != 32: + for e in self.emb_l_q: + print(e.data.detach().cpu().numpy()) + else: # if self.emb_l: + for param in self.emb_l.parameters(): + print(param.detach().cpu().numpy()) + if isinstance(self.v_W_l, nn.ParameterList): + for param in self.v_W_l.parameters(): + print(param.detach().cpu().numpy()) + for param in self.bot_l.parameters(): + print(param.detach().cpu().numpy()) + for param in self.top_l.parameters(): + print(param.detach().cpu().numpy()) + + +def dash_separated_ints(value): + vals = value.split("-") + for val in vals: + try: + int(val) + except ValueError: + raise argparse.ArgumentTypeError( + "%s is not a valid dash separated list of ints" % value + ) + + return value + + +def dash_separated_floats(value): + vals = value.split("-") + for val in vals: + try: + float(val) + except ValueError: + raise argparse.ArgumentTypeError( + "%s is not a valid dash separated list of floats" % value + ) + + return value + + +def inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld, + device, + use_gpu, + log_iter=-1, +): + test_accu = 0 + test_samp = 0 + + if args.mlperf_logging: + scores = [] + targets = [] + + if args.fb5logger is not None: + fb5logger = FB5Logger(args.fb5logger) + fb5logger.header("DLRM", "OOTB", "eval", args.fb5config, score_metric=loggerconstants.EXPS) + + for i, testBatch in enumerate(test_ld): + # early exit if nbatches was set by the user and was exceeded + if nbatches > 0 and i >= nbatches: + break + + if i == args.warmup_steps and args.fb5logger is not None: + fb5logger.run_start() + + X_test, lS_o_test, lS_i_test, T_test, W_test, CBPP_test = unpack_batch( + testBatch + ) + + # Skip the batch if batch size not multiple of total ranks + if ext_dist.my_size > 1 and X_test.size(0) % ext_dist.my_size != 0: + print("Warning: Skiping the batch %d with size %d" % (i, X_test.size(0))) + continue + + # forward pass + Z_test = dlrm_wrap( + X_test, + lS_o_test, + lS_i_test, + use_gpu, + device, + ndevices=ndevices, + ) + ### gather the distributed results on each rank ### + # For some reason it requires explicit sync before all_gather call if + # tensor is on GPU memory + if Z_test.is_cuda: + torch.cuda.synchronize() + (_, batch_split_lengths) = ext_dist.get_split_lengths(X_test.size(0)) + if ext_dist.my_size > 1: + Z_test = ext_dist.all_gather(Z_test, batch_split_lengths) + + if args.mlperf_logging: + S_test = Z_test.detach().cpu().numpy() # numpy array + T_test = T_test.detach().cpu().numpy() # numpy array + scores.append(S_test) + targets.append(T_test) + else: + with record_function("DLRM accuracy compute"): + # compute loss and accuracy + S_test = Z_test.detach().cpu().numpy() # numpy array + T_test = T_test.detach().cpu().numpy() # numpy array + + mbs_test = T_test.shape[0] # = mini_batch_size except last + 
A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8)) + + test_accu += A_test + test_samp += mbs_test + + if args.fb5logger is not None: + fb5logger.run_stop(nbatches - args.warmup_steps, args.mini_batch_size) + + if args.mlperf_logging: + with record_function("DLRM mlperf sklearn metrics compute"): + scores = np.concatenate(scores, axis=0) + targets = np.concatenate(targets, axis=0) + + metrics = { + "recall": lambda y_true, y_score: sklearn.metrics.recall_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "precision": lambda y_true, y_score: sklearn.metrics.precision_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "f1": lambda y_true, y_score: sklearn.metrics.f1_score( + y_true=y_true, y_pred=np.round(y_score) + ), + "ap": sklearn.metrics.average_precision_score, + "roc_auc": sklearn.metrics.roc_auc_score, + "accuracy": lambda y_true, y_score: sklearn.metrics.accuracy_score( + y_true=y_true, y_pred=np.round(y_score) + ), + } + + validation_results = {} + for metric_name, metric_function in metrics.items(): + validation_results[metric_name] = metric_function(targets, scores) + writer.add_scalar( + "mlperf-metrics-test/" + metric_name, + validation_results[metric_name], + log_iter, + ) + acc_test = validation_results["accuracy"] + else: + acc_test = test_accu / test_samp + writer.add_scalar("Test/Acc", acc_test, log_iter) + + model_metrics_dict = { + "nepochs": args.nepochs, + "nbatches": nbatches, + "nbatches_test": nbatches_test, + "state_dict": dlrm.state_dict(), + "test_acc": acc_test, + } + + if args.mlperf_logging: + is_best = validation_results["roc_auc"] > best_auc_test + if is_best: + best_auc_test = validation_results["roc_auc"] + model_metrics_dict["test_auc"] = best_auc_test + print( + "recall {:.4f}, precision {:.4f},".format( + validation_results["recall"], + validation_results["precision"], + ) + + " f1 {:.4f}, ap {:.4f},".format( + validation_results["f1"], validation_results["ap"] + ) + + " auc {:.4f}, best auc {:.4f},".format( + validation_results["roc_auc"], best_auc_test + ) + + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format( + validation_results["accuracy"] * 100, best_acc_test * 100 + ), + flush=True, + ) + else: + is_best = acc_test > best_acc_test + if is_best: + best_acc_test = acc_test + print( + " accuracy {:3.3f} %, best {:3.3f} %".format( + acc_test * 100, best_acc_test * 100 + ), + flush=True, + ) + return model_metrics_dict, is_best + + +def run(): + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Train Deep Learning Recommendation Model (DLRM)" + ) + # model related parameters + parser.add_argument("--arch-sparse-feature-size", type=int, default=2) + parser.add_argument( + "--arch-embedding-size", type=dash_separated_ints, default="4-3-2" + ) + # j will be replaced with the table number + parser.add_argument("--arch-mlp-bot", type=dash_separated_ints, default="4-3-2") + parser.add_argument("--arch-mlp-top", type=dash_separated_ints, default="4-2-1") + parser.add_argument( + "--arch-interaction-op", type=str, choices=["dot", "cat"], default="dot" + ) + parser.add_argument("--arch-interaction-itself", action="store_true", default=False) + parser.add_argument( + "--weighted-pooling", type=str, choices=["fixed", "learned", None], default=None + ) + + # embedding table options + parser.add_argument("--md-flag", action="store_true", default=False) + parser.add_argument("--md-threshold", type=int, default=200) + parser.add_argument("--md-temperature", type=float, default=0.3) + 
parser.add_argument("--md-round-dims", action="store_true", default=False) + parser.add_argument("--qr-flag", action="store_true", default=False) + parser.add_argument("--qr-threshold", type=int, default=200) + parser.add_argument("--qr-operation", type=str, default="mult") + parser.add_argument("--qr-collisions", type=int, default=4) + # activations and loss + parser.add_argument("--activation-function", type=str, default="relu") + parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce + parser.add_argument( + "--loss-weights", type=dash_separated_floats, default="1.0-1.0" + ) # for wbce + parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 + parser.add_argument("--round-targets", type=bool, default=False) + # data + parser.add_argument("--data-size", type=int, default=1) + parser.add_argument("--num-batches", type=int, default=0) + parser.add_argument( + "--data-generation", type=str, default="random" + ) # synthetic or dataset + parser.add_argument( + "--rand-data-dist", type=str, default="uniform" + ) # uniform or gaussian + parser.add_argument("--rand-data-min", type=float, default=0) + parser.add_argument("--rand-data-max", type=float, default=1) + parser.add_argument("--rand-data-mu", type=float, default=-1) + parser.add_argument("--rand-data-sigma", type=float, default=1) + parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log") + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--data-trace-enable-padding", type=bool, default=False) + parser.add_argument("--max-ind-range", type=int, default=-1) + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--num-indices-per-lookup", type=int, default=10) + parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False) + parser.add_argument("--num-workers", type=int, default=0) + parser.add_argument("--memory-map", action="store_true", default=False) + # training + parser.add_argument("--mini-batch-size", type=int, default=1) + parser.add_argument("--nepochs", type=int, default=1) + parser.add_argument("--learning-rate", type=float, default=0.01) + parser.add_argument("--print-precision", type=int, default=5) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + parser.add_argument("--sync-dense-params", type=bool, default=True) + parser.add_argument("--optimizer", type=str, default="sgd") + parser.add_argument( + "--dataset-multiprocessing", + action="store_true", + default=False, + help="The Kaggle dataset can be multiprocessed in an environment \ + with more than 7 CPU cores and more than 20 GB of memory. 
\n \ + The Terabyte dataset can be multiprocessed in an environment \ + with more than 24 CPU cores and at least 1 TB of memory.", + ) + # inference + parser.add_argument("--inference-only", action="store_true", default=False) + # quantize + parser.add_argument("--quantize-mlp-with-bit", type=int, default=32) + parser.add_argument("--quantize-emb-with-bit", type=int, default=32) + # onnx + parser.add_argument("--save-onnx", action="store_true", default=False) + # gpu + parser.add_argument("--use-gpu", action="store_true", default=False) + parser.add_argument("--use-fbgemm-gpu", action="store_true", default=False) + parser.add_argument( + "--fbgemm-gpu-codegen-pref", + type=str, + choices=["Split", "IntN"], + default="Split", + ) + # torch2trt + parser.add_argument("--use-torch2trt-for-mlp", action="store_true", default=False) + # distributed + parser.add_argument("--local_rank", type=int, default=-1) + parser.add_argument("--dist-backend", type=str, default="") + # debugging and profiling + parser.add_argument("--print-freq", type=int, default=1) + parser.add_argument("--test-freq", type=int, default=-1) + parser.add_argument("--test-mini-batch-size", type=int, default=-1) + parser.add_argument("--test-num-workers", type=int, default=-1) + parser.add_argument("--print-time", action="store_true", default=False) + parser.add_argument("--print-wall-time", action="store_true", default=False) + parser.add_argument("--print-accumulated-time", action="store_true", default=False) + parser.add_argument("--debug-mode", action="store_true", default=False) + parser.add_argument("--enable-profiling", action="store_true", default=False) + parser.add_argument("--plot-compute-graph", action="store_true", default=False) + parser.add_argument("--tensor-board-filename", type=str, default="run_kaggle_pt") + # store/load model + parser.add_argument("--save-model", type=str, default="") + parser.add_argument("--load-model", type=str, default="") + # mlperf logging (disables other output and stops early) + parser.add_argument("--mlperf-logging", action="store_true", default=False) + # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107 + parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0) + # stop at target AUC Terabyte (no subsampling) 0.8025 + parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) + parser.add_argument("--mlperf-bin-loader", action="store_true", default=False) + parser.add_argument("--mlperf-bin-shuffle", action="store_true", default=False) + # mlperf gradient accumulation iterations + parser.add_argument("--mlperf-grad-accum-iter", type=int, default=1) + # LR policy + parser.add_argument("--lr-num-warmup-steps", type=int, default=0) + parser.add_argument("--lr-decay-start-step", type=int, default=0) + parser.add_argument("--lr-num-decay-steps", type=int, default=0) + + parser.add_argument("--precache-ml-data", type=int, nargs='?', default=None, const=sys.maxsize) + parser.add_argument("--warmup-steps", type=int, default=0) + # FB5 Logging + parser.add_argument("--fb5logger", type=str, default=None) + parser.add_argument("--fb5config", type=str, default="tiny") + + global args + global nbatches + global nbatches_test + global writer + args = parser.parse_args() + + if args.dataset_multiprocessing: + assert float(sys.version[:3]) > 3.7, ( + "The dataset_multiprocessing " + + "flag is susceptible to a bug in Python 3.7 and under. 
" + + "https://github.com/facebookresearch/dlrm/issues/172" + ) + + if args.mlperf_logging: + mlperf_logger.log_event(key=mlperf_logger.constants.CACHE_CLEAR, value=True) + mlperf_logger.log_start( + key=mlperf_logger.constants.INIT_START, log_all_ranks=True + ) + + if args.weighted_pooling is not None: + if args.qr_flag: + sys.exit("ERROR: quotient remainder with weighted pooling is not supported") + if args.md_flag: + sys.exit("ERROR: mixed dimensions with weighted pooling is not supported") + if args.quantize_emb_with_bit in [4, 8]: + if args.qr_flag: + sys.exit( + "ERROR: 4 and 8-bit quantization with quotient remainder is not supported" + ) + if args.md_flag: + sys.exit( + "ERROR: 4 and 8-bit quantization with mixed dimensions is not supported" + ) + if args.quantize_emb_with_bit in [4, 8, 16] and ( + not fbgemm_gpu or not args.use_fbgemm_gpu + ): + extra_info = "" + if not fbgemm_gpu: + extra_info += "\nfbgemm_gpu module failed to import.\n\n" + fbgemm_gpu_import_error_msg + if not args.use_fbgemm_gpu: + extra_info += "--use-fbgemm-gpu not set. " + + if not args.inference_only: + sys.exit( + "ERROR: Training quantized embeddings requires fbgemm_gpu. " + + extra_info + ) + elif args.use_gpu: + sys.exit( + "ERROR: Quantized embeddings on GPU requires fbgemm_gpu. " + extra_info + ) + elif args.quantize_emb_with_bit == 16: + sys.exit( + "ERROR: 16-bit quantized embeddings requires fbgemm_gpu. " + extra_info + ) + + assert args.quantize_emb_with_bit in [ + 4, + 8, + 16, + 32, + ], "only support 4/8/16/32-bit but got {}".format(args.quantize_emb_with_bit) + + if args.use_gpu: + assert torch.cuda.is_available(), "No cuda device is available." + if args.use_fbgemm_gpu: + assert fbgemm_gpu, ("\nfbgemm_gpu module failed to import.\n\n" + fbgemm_gpu_import_error_msg) + use_gpu = args.use_gpu + use_fbgemm_gpu = args.use_fbgemm_gpu + + ### some basic setup ### + np.random.seed(args.numpy_rand_seed) + np.set_printoptions(precision=args.print_precision) + torch.set_printoptions(precision=args.print_precision) + torch.manual_seed(args.numpy_rand_seed) + + if args.test_mini_batch_size < 0: + # if the parameter is not set, use the training batch size + args.test_mini_batch_size = args.mini_batch_size + if args.test_num_workers < 0: + # if the parameter is not set, use the same parameter for training + args.test_num_workers = args.num_workers + + if not args.debug_mode: + ext_dist.init_distributed( + local_rank=args.local_rank, use_gpu=use_gpu, backend=args.dist_backend + ) + + if use_gpu: + torch.cuda.manual_seed_all(args.numpy_rand_seed) + torch.backends.cudnn.deterministic = True + if ext_dist.my_size > 1: + ngpus = 1 + device = torch.device("cuda", ext_dist.my_local_rank) + else: + ngpus = torch.cuda.device_count() + device = torch.device("cuda", 0) + print("Using {} GPU(s)...".format(ngpus)) + else: + device = torch.device("cpu") + print("Using CPU...") + + ### prepare training data ### + ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") + # input data + + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_end(key=mlperf_logger.constants.INIT_STOP) + mlperf_logger.barrier() + mlperf_logger.log_start(key=mlperf_logger.constants.RUN_START) + mlperf_logger.barrier() + + if args.data_generation == "dataset": + train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args) + table_feature_map = {idx: idx for idx in range(len(train_data.counts))} + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + nbatches_test = len(test_ld) + + 
ln_emb = train_data.counts + # enforce maximum limit on number of vectors per embedding + if args.max_ind_range > 0: + ln_emb = np.array( + list( + map( + lambda x: x if x < args.max_ind_range else args.max_ind_range, + ln_emb, + ) + ) + ) + else: + ln_emb = np.array(ln_emb) + m_den = train_data.m_den + ln_bot[0] = m_den + else: + # input and target at random + ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") + m_den = ln_bot[0] + train_data, train_ld, test_data, test_ld = dp.make_random_data_and_loader( + args, ln_emb, m_den, cache_size=args.precache_ml_data + ) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + nbatches_test = len(test_ld) + + assert args.num_batches > args.warmup_steps, (f"Change --warmup-steps={args.warmup_steps} to be lower than --num-batches={args.num_batches}.") + + args.ln_emb = ln_emb.tolist() + if args.mlperf_logging: + print("command line args: ", json.dumps(vars(args))) + + ### parse command line arguments ### + m_spa = args.arch_sparse_feature_size + ln_emb = np.asarray(ln_emb) + num_fea = ln_emb.size + 1 # num sparse + num dense features + + if args.use_fbgemm_gpu: + assert m_spa % 4 == 0, ( + f"{m_spa} % 4 is not 0, but fbgemm_gpu requires the embedding dim " + + "(--arch-sparse-feature-size number) to be evenly divisible by 4." + ) + + m_den_out = ln_bot[ln_bot.size - 1] + if args.arch_interaction_op == "dot": + # approach 1: all + # num_int = num_fea * num_fea + m_den_out + # approach 2: unique + if args.arch_interaction_itself: + num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out + else: + num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out + elif args.arch_interaction_op == "cat": + num_int = num_fea * m_den_out + else: + sys.exit( + "ERROR: --arch-interaction-op=" + + args.arch_interaction_op + + " is not supported" + ) + arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top + ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") + + # sanity check: feature sizes and mlp dimensions must match + if m_den != ln_bot[0]: + sys.exit( + "ERROR: arch-dense-feature-size " + + str(m_den) + + " does not match first dim of bottom mlp " + + str(ln_bot[0]) + ) + if args.qr_flag: + if args.qr_operation == "concat" and 2 * m_spa != m_den_out: + sys.exit( + "ERROR: 2 arch-sparse-feature-size " + + str(2 * m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + + " (note that the last dim of bottom mlp must be 2x the embedding dim)" + ) + if args.qr_operation != "concat" and m_spa != m_den_out: + sys.exit( + "ERROR: arch-sparse-feature-size " + + str(m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + ) + else: + if m_spa != m_den_out: + sys.exit( + "ERROR: arch-sparse-feature-size " + + str(m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + ) + if num_int != ln_top[0]: + sys.exit( + "ERROR: # of feature interactions " + + str(num_int) + + " does not match first dimension of top mlp " + + str(ln_top[0]) + ) + + # assign mixed dimensions if applicable + if args.md_flag: + m_spa = md_solver( + torch.tensor(ln_emb), + args.md_temperature, # alpha + d0=m_spa, + round_dim=args.md_round_dims, + ).tolist() + if use_fbgemm_gpu: + for m in m_spa: + assert m % 4 == 0, ( + "Found an incompatible embedding dim in m_spa. " + + f"{m} % 4 is not 0, but fbgemm_gpu requires the " + + "embedding dim to be evenly divisible by 4." 
+ ) + + # test prints (model arch) + if args.debug_mode: + print("model arch:") + print( + "mlp top arch " + + str(ln_top.size - 1) + + " layers, with input to output dimensions:" + ) + print(ln_top) + print("# of interactions") + print(num_int) + print( + "mlp bot arch " + + str(ln_bot.size - 1) + + " layers, with input to output dimensions:" + ) + print(ln_bot) + print("# of features (sparse and dense)") + print(num_fea) + print("dense feature size") + print(m_den) + print("sparse feature size") + print(m_spa) + print( + "# of embeddings (= # of sparse features) " + + str(ln_emb.size) + + ", with dimensions " + + str(m_spa) + + "x:" + ) + print(ln_emb) + + print("data (inputs and targets):") + for j, inputBatch in enumerate(train_ld): + X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch) + + torch.set_printoptions(precision=4) + # early exit if nbatches was set by the user and has been exceeded + if nbatches > 0 and j >= nbatches: + break + print("mini-batch: %d" % j) + print(X.detach().cpu()) + # transform offsets to lengths when printing + print( + torch.IntTensor( + [ + np.diff( + S_o.detach().cpu().tolist() + list(lS_i[i].shape) + ).tolist() + for i, S_o in enumerate(lS_o) + ] + ) + ) + print([S_i.detach().cpu() for S_i in lS_i]) + print(T.detach().cpu()) + + global ndevices + ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1 + + ### construct the neural network specified above ### + # WARNING: to obtain exactly the same initialization for + # the weights we need to start from the same random seed. + # np.random.seed(args.numpy_rand_seed) + global dlrm + dlrm = DLRM_Net( + m_spa, + ln_emb, + ln_bot, + ln_top, + arch_interaction_op=args.arch_interaction_op, + arch_interaction_itself=args.arch_interaction_itself, + sigmoid_bot=-1, + sigmoid_top=ln_top.size - 2, + sync_dense_params=args.sync_dense_params, + loss_threshold=args.loss_threshold, + ndevices=ndevices, + qr_flag=args.qr_flag, + qr_operation=args.qr_operation, + qr_collisions=args.qr_collisions, + qr_threshold=args.qr_threshold, + md_flag=args.md_flag, + md_threshold=args.md_threshold, + weighted_pooling=args.weighted_pooling, + loss_function=args.loss_function, + learning_rate=args.learning_rate, + use_gpu=use_gpu, + use_fbgemm_gpu=use_fbgemm_gpu, + fbgemm_gpu_codegen_pref=args.fbgemm_gpu_codegen_pref, + inference_only=args.inference_only, + quantize_mlp_with_bit=args.quantize_mlp_with_bit, + quantize_emb_with_bit=args.quantize_emb_with_bit, + ) + + # test prints + if args.debug_mode: + print("initial parameters (weights and bias):") + dlrm.print_weights() + + # In dlrm.quantize_embedding called below, the torch quantize calls run + # on cpu tensors only. They cannot quantize tensors stored on the gpu. + # So quantization occurs on cpu tensors before transferring them to gpu if + # use_gpu is enabled. + if args.quantize_emb_with_bit != 32: + dlrm.quantize_embedding(args.quantize_emb_with_bit) + + if not args.inference_only: + assert args.quantize_mlp_with_bit == 32, ( + "Dynamic quantization for mlp requires " + + "--inference-only because training is not supported" + ) + else: + # Currently only INT8 and FP16 quantized types are supported for quantized MLP inference. 
+ # By default we don't do the quantization: quantize_{mlp,emb}_with_bit == 32 (FP32) + assert args.quantize_mlp_with_bit in [ + 8, + 16, + 32, + ], "only support 8/16/32-bit but got {}".format(args.quantize_mlp_with_bit) + + if args.quantize_mlp_with_bit != 32: + assert not use_gpu, ( + "Cannot run dynamic quantization for mlp " + + "with --use-gpu enabled, because DynamicQuantizedLinear's " + + "forward call calls 'quantized::linear_dynamic', which cannot " + + "run with arguments from the 'CUDA' backend." + ) + if args.quantize_mlp_with_bit in [8]: + quantize_dtype = torch.qint8 + else: + quantize_dtype = torch.float16 + dlrm.top_l = torch.quantization.quantize_dynamic( + dlrm.top_l, {torch.nn.Linear}, quantize_dtype + ) + dlrm.bot_l = torch.quantization.quantize_dynamic( + dlrm.bot_l, {torch.nn.Linear}, quantize_dtype + ) + + # Prep work for embedding tables and model transfer: + # Handling single-cpu and single-gpu modes + # NOTE: This also handles dist-backend modes (CLI args --dist-backend=nccl, + # --dist-backend=ccl, and --dist-backend=mpi) because in these modes each + # process runs in single-gpu mode. For example, if 8 processes are launched + # running dlrm_s_pytorch.py with --dist-backend=nccl --use-gpu, each process + # will run in single-gpu mode, resulting in 8 gpus total running distributed + # training or distributed inference if --inference-only is enabled. + if dlrm.ndevices_available <= 1: + if use_fbgemm_gpu: + dlrm.fbgemm_emb_l = nn.ModuleList( + [ + fbgemm_gpu_emb_bag_wrapper( + device, + dlrm.emb_l if dlrm.emb_l else dlrm.emb_l_q, + dlrm.m_spa, + dlrm.quantize_bits, + dlrm.learning_rate, + dlrm.fbgemm_gpu_codegen_pref, + dlrm.requires_grad, + ) + ] + ) + if use_gpu: + dlrm = dlrm.to(device) + if dlrm.weighted_pooling == "fixed": + for k, w in enumerate(dlrm.v_W_l): + dlrm.v_W_l[k] = w.cuda() + else: + # Handing Multi-gpu mode + dlrm.bot_l = dlrm.bot_l.to(device) + dlrm.top_l = dlrm.top_l.to(device) + dlrm.prepare_parallel_model(ndevices) + + if args.use_torch2trt_for_mlp: + if torch2trt and use_gpu and args.inference_only and args.quantize_mlp_with_bit == 32: + bot_l_sample_input = torch.ones([1, ln_bot[0]], dtype=torch.float32).cuda() + top_l_sample_input = torch.ones([1, ln_top[0]], dtype=torch.float32).cuda() + dlrm.bot_l = torch2trt.torch2trt(dlrm.bot_l, (bot_l_sample_input,)) + dlrm.top_l = torch2trt.torch2trt(dlrm.top_l, (top_l_sample_input,)) + elif torch2trt is None: + sys.exit("\ntorch2trt module failed to import.\n\n" + torch2trt_import_error_msg) + else: + error_msg = "ERROR: When --use-torch2trt-for-mlp is enabled, " + if not use_gpu: + error_msg += "--use-gpu must be enabled, " + if not args.inference_only: + error_msg += "--inference-only must be enabled, " + if args.quantize_mlp_with_bit != 32: + error_msg += "--quantize-mlp-with-bit must be disabled. " + error_msg = error_msg[:-2] + "." 
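# A minimal CPU-only sketch of the dynamic quantization applied to dlrm.top_l and
# dlrm.bot_l above; the toy MLP and its layer sizes are hypothetical.
import torch
import torch.nn as nn

toy_mlp = nn.Sequential(nn.Linear(16, 8), nn.ReLU(), nn.Linear(8, 1), nn.Sigmoid())
toy_mlp_int8 = torch.quantization.quantize_dynamic(toy_mlp, {nn.Linear}, dtype=torch.qint8)
toy_mlp_fp16 = torch.quantization.quantize_dynamic(toy_mlp, {nn.Linear}, dtype=torch.float16)

x = torch.randn(4, 16)
# Weights are stored quantized and activations are quantized on the fly at run time,
# which is why this path is restricted to --inference-only and to CPU tensors.
print(toy_mlp_int8(x).shape, toy_mlp_fp16(x).shape)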
+ sys.exit(error_msg) + + # distribute data parallel mlps + if ext_dist.my_size > 1: + if use_gpu: + device_ids = [ext_dist.my_local_rank] + dlrm.bot_l = ext_dist.DDP(dlrm.bot_l, device_ids=device_ids) + dlrm.top_l = ext_dist.DDP(dlrm.top_l, device_ids=device_ids) + else: + dlrm.bot_l = ext_dist.DDP(dlrm.bot_l) + dlrm.top_l = ext_dist.DDP(dlrm.top_l) + + if not args.inference_only: + # specify the optimizer algorithm + opts = { + "sgd": torch.optim.SGD, + "rwsadagrad": RowWiseSparseAdagrad.RWSAdagrad, + "adagrad": apex.optimizers.FusedAdagrad + if apex + else torch.optim.Adagrad, + } + + parameters = ( + dlrm.parameters() + if ext_dist.my_size == 1 + else [ + { + "params": [ + p + for emb in ( + [e.fbgemm_gpu_emb_bag for e in dlrm.fbgemm_emb_l] + if use_fbgemm_gpu + else dlrm.emb_l_q + if dlrm.quantize_bits != 32 + else dlrm.emb_l + ) + for p in emb.parameters() + ], + "lr": args.learning_rate, + }, + # TODO check this lr setup + # bottom mlp has no data parallelism + # need to check how do we deal with top mlp + { + "params": dlrm.bot_l.parameters(), + "lr": args.learning_rate, + }, + { + "params": dlrm.top_l.parameters(), + "lr": args.learning_rate, + }, + ] + ) + optimizer = opts[args.optimizer](parameters, lr=args.learning_rate) + lr_scheduler = LRPolicyScheduler( + optimizer, + args.lr_num_warmup_steps, + args.lr_decay_start_step, + args.lr_num_decay_steps, + ) + + # Guarantee GPU setup has completed before training or inference starts. + if use_gpu: + torch.cuda.synchronize() + + ### main loop ### + + # training or inference + best_acc_test = 0 + best_auc_test = 0 + skip_upto_epoch = 0 + skip_upto_batch = 0 + total_time = 0 + total_loss = 0 + total_iter = 0 + total_samp = 0 + + if args.mlperf_logging: + mlperf_logger.mlperf_submission_log("dlrm") + mlperf_logger.log_event( + key=mlperf_logger.constants.SEED, value=args.numpy_rand_seed + ) + mlperf_logger.log_event( + key=mlperf_logger.constants.GLOBAL_BATCH_SIZE, value=args.mini_batch_size + ) + + # Load model is specified + if not (args.load_model == ""): + print("Loading saved model {}".format(args.load_model)) + if use_gpu: + if dlrm.ndevices_available > 1: + # NOTE: when targeting inference on multiple GPUs, + # load the model as is on CPU or GPU, with the move + # to multiple GPUs to be done in parallel_forward + ld_model = torch.load(args.load_model) + else: + # NOTE: when targeting inference on single GPU, + # note that the call to .to(device) has already happened + ld_model = torch.load( + args.load_model, + map_location=torch.device("cuda") + # map_location=lambda storage, loc: storage.cuda(0) + ) + else: + # when targeting inference on CPU + ld_model = torch.load(args.load_model, map_location=torch.device("cpu")) + dlrm.load_state_dict(ld_model["state_dict"]) + ld_j = ld_model["iter"] + ld_k = ld_model["epoch"] + ld_nepochs = ld_model["nepochs"] + ld_nbatches = ld_model["nbatches"] + ld_nbatches_test = ld_model["nbatches_test"] + ld_train_loss = ld_model["train_loss"] + ld_total_loss = ld_model["total_loss"] + if args.mlperf_logging: + ld_gAUC_test = ld_model["test_auc"] + ld_acc_test = ld_model["test_acc"] + if not args.inference_only: + optimizer.load_state_dict(ld_model["opt_state_dict"]) + best_acc_test = ld_acc_test + total_loss = ld_total_loss + skip_upto_epoch = ld_k # epochs + skip_upto_batch = ld_j # batches + else: + args.print_freq = ld_nbatches + args.test_freq = 0 + + print( + "Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format( + ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test + ) + ) 
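# A minimal sketch of the optimizer dispatch used above: the optimizer class is looked up
# by name and each parameter group carries its own learning rate. The tiny two-layer
# model and the rates below are hypothetical.
import torch
import torch.nn as nn

toy_bot, toy_top = nn.Linear(8, 4), nn.Linear(4, 1)
toy_opts = {"sgd": torch.optim.SGD, "adagrad": torch.optim.Adagrad}
toy_param_groups = [
    {"params": toy_bot.parameters(), "lr": 0.05},
    {"params": toy_top.parameters(), "lr": 0.01},
]
toy_optimizer = toy_opts["sgd"](toy_param_groups, lr=0.1)  # per-group "lr" overrides 0.1
print([g["lr"] for g in toy_optimizer.param_groups])       # [0.05, 0.01]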
+ print( + "Training state: loss = {:.6f}".format( + ld_train_loss, + ) + ) + if args.mlperf_logging: + print( + "Testing state: accuracy = {:3.3f} %, auc = {:.3f}".format( + ld_acc_test * 100, ld_gAUC_test + ) + ) + else: + print("Testing state: accuracy = {:3.3f} %".format(ld_acc_test * 100)) + + print("time/loss/accuracy (if enabled):") + + if args.mlperf_logging: + # LR is logged twice for now because of a compliance checker bug + mlperf_logger.log_event( + key=mlperf_logger.constants.OPT_BASE_LR, value=args.learning_rate + ) + mlperf_logger.log_event( + key=mlperf_logger.constants.OPT_LR_WARMUP_STEPS, + value=args.lr_num_warmup_steps, + ) + + # use logging keys from the official HP table and not from the logging library + mlperf_logger.log_event( + key="sgd_opt_base_learning_rate", value=args.learning_rate + ) + mlperf_logger.log_event( + key="lr_decay_start_steps", value=args.lr_decay_start_step + ) + mlperf_logger.log_event( + key="sgd_opt_learning_rate_decay_steps", value=args.lr_num_decay_steps + ) + mlperf_logger.log_event(key="sgd_opt_learning_rate_decay_poly_power", value=2) + + tb_file = "./" + args.tensor_board_filename + writer = SummaryWriter(tb_file) + + # Pre-cache samples. + if args.precache_ml_data: + for _ in (test_ld if args.inference_only else train_ld): + pass + + ext_dist.barrier() + with torch.autograd.profiler.profile( + args.enable_profiling, use_cuda=use_gpu, record_shapes=True + ) as prof: + + if not args.inference_only: + + if args.fb5logger is not None: + fb5logger = FB5Logger(args.fb5logger) + fb5logger.header("DLRM", "OOTB", "train", args.fb5config, score_metric=loggerconstants.EXPS) + + k = 0 + while k < args.nepochs: + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_start( + key=mlperf_logger.constants.BLOCK_START, + metadata={ + mlperf_logger.constants.FIRST_EPOCH_NUM: (k + 1), + mlperf_logger.constants.EPOCH_COUNT: 1, + }, + ) + mlperf_logger.barrier() + mlperf_logger.log_start( + key=mlperf_logger.constants.EPOCH_START, + metadata={mlperf_logger.constants.EPOCH_NUM: (k + 1)}, + ) + + if k < skip_upto_epoch: + continue + + if args.print_accumulated_time: + accum_time_begin = time_wrap(use_gpu) + + if args.mlperf_logging: + previous_iteration_time = None + + for j, inputBatch in enumerate(train_ld): + if j == 0 and args.save_onnx: + X_onnx, lS_o_onnx, lS_i_onnx, _, _, _ = unpack_batch(inputBatch) + + if j < skip_upto_batch: + continue + + if k == 0 and j == args.warmup_steps and args.fb5logger is not None: + fb5logger.run_start() + + X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch) + + if args.mlperf_logging: + current_time = time_wrap(use_gpu) + if previous_iteration_time: + iteration_time = current_time - previous_iteration_time + else: + iteration_time = 0 + previous_iteration_time = current_time + else: + t1 = time_wrap(use_gpu) + + # early exit if nbatches was set by the user and has been exceeded + if nbatches > 0 and j >= nbatches: + break + + # Skip the batch if batch size not multiple of total ranks + if ext_dist.my_size > 1 and X.size(0) % ext_dist.my_size != 0: + print( + "Warning: Skiping the batch %d with size %d" + % (j, X.size(0)) + ) + continue + + mbs = T.shape[0] # = args.mini_batch_size except maybe for last + + # forward pass + Z = dlrm_wrap( + X, + lS_o, + lS_i, + use_gpu, + device, + ndevices=ndevices, + ) + + if ext_dist.my_size > 1: + T = T[ext_dist.get_my_slice(mbs)] + W = W[ext_dist.get_my_slice(mbs)] + + # loss + E = loss_fn_wrap(Z, T, use_gpu, device) + + # compute loss and accuracy + L = 
E.detach().cpu().numpy() # numpy array + # training accuracy is not disabled + # S = Z.detach().cpu().numpy() # numpy array + # T = T.detach().cpu().numpy() # numpy array + + # # print("res: ", S) + + # # print("j, train: BCE", j, L) + + # mbs = T.shape[0] # = args.mini_batch_size except maybe for last + # A = np.sum((np.round(S, 0) == T).astype(np.uint8)) + + with record_function("DLRM backward"): + # Update optimizer parameters to train weights instantiated lazily in + # the parallel_forward call. + if dlrm.ndevices_available > 1 and dlrm.add_new_weights_to_params: + + # Pop any prior extra parameters. Priors may exist because + # self.parallel_model_is_not_prepared is set back to True + # when self.parallel_model_batch_size != batch_size. + # Search "self.parallel_model_batch_size != batch_size" in code. + if "lazy_params" in optimizer.param_groups[-1].keys(): + optimizer.param_groups.pop() + + # dlrm.v_W_l_l is a list of nn.ParameterLists, one ParameterList per gpu. + # Flatten the list of nn.ParameterList to one nn.ParameterList, + # and add it to the trainable params list. + lazy_params = nn.ParameterList() + if dlrm.weighted_pooling == "learned": + lazy_params.extend( + nn.ParameterList( + [p for p_l in dlrm.v_W_l_l for p in p_l] + ) + ) + if dlrm.use_fbgemm_gpu: + lazy_params.extend( + nn.ParameterList( + [ + emb + for emb_ in dlrm.fbgemm_emb_l + for emb in emb_.fbgemm_gpu_emb_bag.parameters() + ] + ) + ) + lazy_params_dict = optimizer.param_groups[0] + lazy_params_dict["lazy_params"] = True + lazy_params_dict["params"] = lazy_params + optimizer.param_groups.append(lazy_params_dict) + dlrm.add_new_weights_to_params = False + # Run "[[t.device.type for t in grp['params']] for grp in optimizer.param_groups]" + # to view devices used by tensors in the param groups. 
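# A minimal sketch of registering lazily created parameters with an existing optimizer,
# which is what the "lazy_params" bookkeeping above does by editing
# optimizer.param_groups directly; add_param_group() is the standard API for the same
# idea. The tensors below are hypothetical.
import torch
import torch.nn as nn

w = nn.Parameter(torch.randn(4, 2))
toy_optimizer = torch.optim.SGD([w], lr=0.1)

late_w = nn.Parameter(torch.randn(3, 2))        # created lazily, e.g. on first forward
toy_optimizer.add_param_group({"params": [late_w], "lr": 0.1})
print(len(toy_optimizer.param_groups))          # 2: the original group plus the lazy one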
+ + # scaled error gradient propagation + # (where we do not accumulate gradients across mini-batches) + if ( + args.mlperf_logging + and (j + 1) % args.mlperf_grad_accum_iter == 0 + ) or not args.mlperf_logging: + optimizer.zero_grad() + # backward pass + E.backward() + + # optimizer + if ( + args.mlperf_logging + and (j + 1) % args.mlperf_grad_accum_iter == 0 + ) or not args.mlperf_logging: + optimizer.step() + lr_scheduler.step() + + if args.mlperf_logging: + total_time += iteration_time + else: + t2 = time_wrap(use_gpu) + total_time += t2 - t1 + + total_loss += L * mbs + total_iter += 1 + total_samp += mbs + + should_print = ((j + 1) % args.print_freq == 0) or ( + j + 1 == nbatches + ) + should_test = ( + (args.test_freq > 0) + and (args.data_generation in ["dataset", "random"]) + and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches)) + ) + + # print time, loss and accuracy + if should_print or should_test: + gT = 1000.0 * total_time / total_iter if args.print_time else -1 + total_time = 0 + + train_loss = total_loss / total_samp + total_loss = 0 + + str_run_type = ( + "inference" if args.inference_only else "training" + ) + + wall_time = "" + if args.print_wall_time: + wall_time = " ({})".format(time.strftime("%H:%M")) + + print( + "Finished {} it {}/{} of epoch {}, {:.2f} ms/it,".format( + str_run_type, j + 1, nbatches, k, gT + ) + + " loss {:.6f}".format(train_loss) + + wall_time, + flush=True, + ) + + if args.print_accumulated_time and ext_dist.my_rank < 2: + current_unix_time = time_wrap(use_gpu) + ext_dist.orig_print( + "Accumulated time so far: {} for process {} for step {} at {}".format( + current_unix_time - accum_time_begin, + ext_dist.my_rank, + j + 1, + current_unix_time, + ) + ) + + log_iter = nbatches * k + j + 1 + writer.add_scalar("Train/Loss", train_loss, log_iter) + + total_iter = 0 + total_samp = 0 + + # testing + if should_test: + epoch_num_float = (j + 1) / len(train_ld) + k + 1 + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_start( + key=mlperf_logger.constants.EVAL_START, + metadata={ + mlperf_logger.constants.EPOCH_NUM: epoch_num_float + }, + ) + + # don't measure training iter time in a test iteration + if args.mlperf_logging: + previous_iteration_time = None + print( + "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k) + ) + model_metrics_dict, is_best = inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld, + device, + use_gpu, + log_iter, + ) + + if ( + is_best + and not (args.save_model == "") + and not args.inference_only + ): + model_metrics_dict["epoch"] = k + model_metrics_dict["iter"] = j + 1 + model_metrics_dict["train_loss"] = train_loss + model_metrics_dict["total_loss"] = total_loss + model_metrics_dict[ + "opt_state_dict" + ] = optimizer.state_dict() + print("Saving model to {}".format(args.save_model)) + torch.save(model_metrics_dict, args.save_model) + + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_end( + key=mlperf_logger.constants.EVAL_STOP, + metadata={ + mlperf_logger.constants.EPOCH_NUM: epoch_num_float + }, + ) + + # Uncomment the line below to print out the total time with overhead + # print("Total test time for this group: {}" \ + # .format(time_wrap(use_gpu) - accum_test_time_begin)) + + if ( + args.mlperf_logging + and (args.mlperf_acc_threshold > 0) + and (best_acc_test > args.mlperf_acc_threshold) + ): + print( + "MLPerf testing accuracy threshold " + + str(args.mlperf_acc_threshold) + + " reached, stop training" + ) + break + + if ( + 
args.mlperf_logging + and (args.mlperf_auc_threshold > 0) + and (best_auc_test > args.mlperf_auc_threshold) + ): + print( + "MLPerf testing auc threshold " + + str(args.mlperf_auc_threshold) + + " reached, stop training" + ) + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_end( + key=mlperf_logger.constants.RUN_STOP, + metadata={ + mlperf_logger.constants.STATUS: mlperf_logger.constants.SUCCESS + }, + ) + break + if k == 0 and args.fb5logger is not None: + fb5logger.run_stop(nbatches - args.warmup_steps, args.mini_batch_size) + + if args.mlperf_logging: + mlperf_logger.barrier() + mlperf_logger.log_end( + key=mlperf_logger.constants.EPOCH_STOP, + metadata={mlperf_logger.constants.EPOCH_NUM: (k + 1)}, + ) + mlperf_logger.barrier() + mlperf_logger.log_end( + key=mlperf_logger.constants.BLOCK_STOP, + metadata={mlperf_logger.constants.FIRST_EPOCH_NUM: (k + 1)}, + ) + k += 1 # nepochs + if args.mlperf_logging and best_auc_test <= args.mlperf_auc_threshold: + mlperf_logger.barrier() + mlperf_logger.log_end( + key=mlperf_logger.constants.RUN_STOP, + metadata={ + mlperf_logger.constants.STATUS: mlperf_logger.constants.ABORTED + }, + ) + else: + print("Testing for inference only") + inference( + args, + dlrm, + best_acc_test, + best_auc_test, + test_ld, + device, + use_gpu, + ) + + # profiling + if args.enable_profiling: + time_stamp = str(datetime.datetime.now()).replace(" ", "_") + with open("dlrm_s_pytorch" + time_stamp + "_shape.prof", "w") as prof_f: + prof_f.write( + prof.key_averages(group_by_input_shape=True).table( + sort_by="self_cpu_time_total" + ) + ) + with open("dlrm_s_pytorch" + time_stamp + "_total.prof", "w") as prof_f: + prof_f.write(prof.key_averages().table(sort_by="self_cpu_time_total")) + prof.export_chrome_trace("dlrm_s_pytorch" + time_stamp + ".json") + # print(prof.key_averages().table(sort_by="cpu_time_total")) + + # plot compute graph + if args.plot_compute_graph: + sys.exit( + "ERROR: Please install pytorchviz package in order to use the" + + " visualization. Then, uncomment its import above as well as" + + " three lines below and run the code again." 
+ ) + # V = Z.mean() if args.inference_only else E + # dot = make_dot(V, params=dict(dlrm.named_parameters())) + # dot.render('dlrm_s_pytorch_graph') # write .pdf file + + # test prints + if not args.inference_only and args.debug_mode: + print("updated parameters (weights and bias):") + dlrm.print_weights() + + # export the model in onnx + if args.save_onnx: + """ + # workaround 1: tensor -> list + if torch.is_tensor(lS_i_onnx): + lS_i_onnx = [lS_i_onnx[j] for j in range(len(lS_i_onnx))] + # workaound 2: list -> tensor + lS_i_onnx = torch.stack(lS_i_onnx) + """ + # debug prints + # print("inputs", X_onnx, lS_o_onnx, lS_i_onnx) + # print("output", dlrm_wrap(X_onnx, lS_o_onnx, lS_i_onnx, use_gpu, device)) + dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx" + print("X_onnx.shape", X_onnx.shape) + if torch.is_tensor(lS_o_onnx): + print("lS_o_onnx.shape", lS_o_onnx.shape) + else: + for oo in lS_o_onnx: + print("oo.shape", oo.shape) + if torch.is_tensor(lS_i_onnx): + print("lS_i_onnx.shape", lS_i_onnx.shape) + else: + for ii in lS_i_onnx: + print("ii.shape", ii.shape) + + # name inputs and outputs + o_inputs = ( + ["offsets"] + if torch.is_tensor(lS_o_onnx) + else ["offsets_" + str(i) for i in range(len(lS_o_onnx))] + ) + i_inputs = ( + ["indices"] + if torch.is_tensor(lS_i_onnx) + else ["indices_" + str(i) for i in range(len(lS_i_onnx))] + ) + all_inputs = ["dense_x"] + o_inputs + i_inputs + # debug prints + print("inputs", all_inputs) + + # create dynamic_axis dictionaries + do_inputs = ( + [{"offsets": {1: "batch_size"}}] + if torch.is_tensor(lS_o_onnx) + else [ + {"offsets_" + str(i): {0: "batch_size"}} for i in range(len(lS_o_onnx)) + ] + ) + di_inputs = ( + [{"indices": {1: "batch_size"}}] + if torch.is_tensor(lS_i_onnx) + else [ + {"indices_" + str(i): {0: "batch_size"}} for i in range(len(lS_i_onnx)) + ] + ) + dynamic_axes = {"dense_x": {0: "batch_size"}, "pred": {0: "batch_size"}} + for do in do_inputs: + dynamic_axes.update(do) + for di in di_inputs: + dynamic_axes.update(di) + # debug prints + print(dynamic_axes) + # export model + torch.onnx.export( + dlrm, + (X_onnx, lS_o_onnx, lS_i_onnx), + dlrm_pytorch_onnx_file, + verbose=True, + use_external_data_format=True, + opset_version=11, + input_names=all_inputs, + output_names=["pred"], + dynamic_axes=dynamic_axes, + ) + # recover the model back + dlrm_pytorch_onnx = onnx.load("dlrm_s_pytorch.onnx") + # check the onnx model + onnx.checker.check_model(dlrm_pytorch_onnx) + total_time_end = time_wrap(use_gpu) + + +if __name__ == "__main__": + run() diff --git a/benchmarks/dlrm/ootb/extend_distributed.py b/benchmarks/dlrm/ootb/extend_distributed.py new file mode 100644 index 0000000..1f2c8a5 --- /dev/null +++ b/benchmarks/dlrm/ootb/extend_distributed.py @@ -0,0 +1,603 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
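# A minimal sketch (world size and batch size are hypothetical) of the divmod-based
# partitioning performed by get_split_lengths() and get_my_slice() below when a batch
# does not divide evenly across ranks.
def split_lengths_example(n, world_size):
    k, m = divmod(n, world_size)
    return [(k + 1) if i < m else k for i in range(world_size)]

print(split_lengths_example(10, 4))  # [3, 3, 2, 2]: the first n % world_size ranks get one extra row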
+# +import builtins +import os +import sys + +import torch +import torch.distributed as dist +from torch.autograd import Function +from torch.autograd.profiler import record_function +from torch.nn.parallel import DistributedDataParallel as DDP + + +try: + import torch_ccl +except ImportError as e: + # print(e) + torch_ccl = False + +try: + import torch_ucc +except ImportError as e: + torch_ucc = False + + +my_rank = -1 +my_size = -1 +my_local_rank = -1 +my_local_size = -1 +alltoall_supported = False +a2a_impl = os.environ.get("DLRM_ALLTOALL_IMPL", "") + +myreq = None + + +def env2int(env_list, default=-1): + for e in env_list: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +def get_my_slice(n): + k, m = divmod(n, my_size) + return slice( + my_rank * k + min(my_rank, m), (my_rank + 1) * k + min(my_rank + 1, m), 1 + ) + + +def get_split_lengths(n): + k, m = divmod(n, my_size) + if m == 0: + splits = None + my_len = k + else: + splits = [(k + 1) if i < m else k for i in range(my_size)] + my_len = splits[my_rank] + return (my_len, splits) + + +def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend=""): + global myreq + global my_rank + global my_size + global my_local_rank + global my_local_size + global a2a_impl + global alltoall_supported + + # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2) + num_mpi_ranks = env2int( + ["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"] + ) + if backend == "" and num_mpi_ranks > 1: + if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0: + backend = "ccl" + elif use_gpu and dist.is_nccl_available(): + backend = "nccl" + elif dist.is_mpi_available(): + backend = "mpi" + else: + print( + "WARNING: MPI multi-process launch detected but PyTorch MPI backend not available." + ) + backend = "gloo" + + if backend != "": + # guess Rank and size + if rank == -1: + rank = env2int( + ["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0 + ) + if size == -1: + size = env2int( + [ + "PMI_SIZE", + "OMPI_COMM_WORLD_SIZE", + "MV2_COMM_WORLD_SIZE", + "WORLD_SIZE", + ], + 1, + ) + if not os.environ.get("RANK", None) and rank != -1: + os.environ["RANK"] = str(rank) + if not os.environ.get("WORLD_SIZE", None) and size != -1: + os.environ["WORLD_SIZE"] = str(size) + if not os.environ.get("MASTER_PORT", None): + os.environ["MASTER_PORT"] = "29500" + if not os.environ.get("MASTER_ADDR", None): + local_size = env2int( + [ + "MPI_LOCALNRANKS", + "OMPI_COMM_WORLD_LOCAL_SIZE", + "MV2_COMM_WORLD_LOCAL_SIZE", + ], + 1, + ) + if local_size != size and backend != "mpi": + print( + "Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default" + ) + print( + "If this run hangs, try exporting rank 0's hostname as MASTER_ADDR" + ) + os.environ["MASTER_ADDR"] = "127.0.0.1" + + if size > 1: + if local_rank == -1: + my_local_rank = env2int( + [ + "MPI_LOCALRANKID", + "OMPI_COMM_WORLD_LOCAL_RANK", + "MV2_COMM_WORLD_LOCAL_RANK", + "LOCAL_RANK", + ], + 0, + ) + else: + my_local_rank = local_rank + my_local_size = env2int( + [ + "MPI_LOCALNRANKS", + "OMPI_COMM_WORLD_LOCAL_SIZE", + "MV2_COMM_WORLD_LOCAL_SIZE", + ], + 1, + ) + if use_gpu: + if my_local_size > torch.cuda.device_count(): + print( + "Not sufficient GPUs available... 
local_size = %d, ngpus = %d" + % (my_local_size, torch.cuda.device_count()) + ) + sys.exit(1) + torch.cuda.set_device(my_local_rank) + dist.init_process_group(backend, rank=rank, world_size=size) + my_rank = dist.get_rank() + my_size = dist.get_world_size() + if my_rank == 0: + print("Running on %d ranks using %s backend" % (my_size, backend)) + if hasattr(dist, "all_to_all_single"): + try: + t = torch.zeros([4]) + if use_gpu: + t = t.cuda() + dist.all_to_all_single(t, t) + alltoall_supported = True + except RuntimeError as err: + print("fail to enable all_to_all_single primitive: %s" % err) + if a2a_impl == "alltoall" and alltoall_supported == False: + print( + "Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall" + % (a2a_impl, backend) + ) + a2a_impl = "scatter" + if a2a_impl != "": + print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl) + else: + my_rank = 0 + my_size = 1 + my_local_rank = 0 + my_local_size = 1 + print_all( + "world size: %d, current rank: %d, local rank: %d" + % (my_size, my_rank, my_local_rank) + ) + myreq = Request() + + +class Request(object): + def __init__(self): + self.req = None + self.tensor = None + self.WaitFunction = All2All_Scatter_Wait + + def wait(self): + ret = self.WaitFunction.apply(*self.tensor) + self.req = None + self.tensor = None + return ret + + +class All2All_ScatterList_Req(Function): + @staticmethod + def forward(ctx, a2a_info, *inputs): + global myreq + batch_split_lengths = ( + a2a_info.global_batch_partition_slices + if a2a_info.global_batch_partition_slices + else a2a_info.local_batch_num + ) + table_split_lengths = ( + a2a_info.global_table_wise_parition_slices + if a2a_info.global_table_wise_parition_slices + else [a2a_info.local_table_num] * my_size + ) + gather_list = [] + req_list = [] + for i in range(my_size): + for j in range(table_split_lengths[i]): + out_tensor = inputs[0].new_empty( + [a2a_info.local_batch_num, a2a_info.emb_dim] + ) + scatter_list = ( + list(inputs[j].split(batch_split_lengths, dim=0)) + if i == my_rank + else [] + ) + req = dist.scatter(out_tensor, scatter_list, src=i, async_op=True) + gather_list.append(out_tensor) + req_list.append(req) + myreq.req = req_list + myreq.tensor = tuple(gather_list) + myreq.a2a_info = a2a_info + return myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + for r in myreq.req: + r.wait() + myreq.req = None + grad_inputs = myreq.tensor + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_ScatterList_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + ctx.a2a_info = myreq.a2a_info + for r in myreq.req: + r.wait() + myreq.req = None + myreq.tensor = None + return output + + @staticmethod + def backward(ctx, *grad_output): + global myreq + a2a_info = ctx.a2a_info + grad_output = [t.contiguous() for t in grad_output] + batch_split_lengths = ( + a2a_info.global_batch_partition_slices + if a2a_info.global_batch_partition_slices + else [a2a_info.local_batch_num] * my_size + ) + per_rank_table_splits = ( + a2a_info.global_table_wise_parition_slices + if a2a_info.global_table_wise_parition_slices + else [a2a_info.local_table_num] * my_size + ) + grad_inputs = [ + grad_output[0].new_empty([ctx.a2a_info.batch_size, ctx.a2a_info.emb_dim]) + for _ in range(a2a_info.local_table_num) + ] + req_list = [] + ind = 0 + for i in range(my_size): + for j in range(per_rank_table_splits[i]): + gather_list = ( + list(grad_inputs[j].split(batch_split_lengths, dim=0)) + if i == my_rank + else 
None + ) + req = dist.gather(grad_output[ind], gather_list, dst=i, async_op=True) + req_list.append(req) + ind += 1 + myreq.req = req_list + myreq.tensor = grad_inputs + return tuple(grad_output) + + +class All2All_Scatter_Req(Function): + @staticmethod + def forward(ctx, a2a_info, *inputs): + global myreq + batch_split_lengths = ( + a2a_info.global_batch_partition_slices + if a2a_info.global_batch_partition_slices + else a2a_info.local_batch_num + ) + table_split_lengths = ( + a2a_info.global_table_wise_parition_slices + if a2a_info.global_table_wise_parition_slices + else [a2a_info.local_table_num] * my_size + ) + input = torch.cat(inputs, dim=1) + scatter_list = list(input.split(batch_split_lengths, dim=0)) + gather_list = [] + req_list = [] + for i in range(my_size): + out_tensor = input.new_empty( + [a2a_info.local_batch_num, table_split_lengths[i] * a2a_info.emb_dim] + ) + req = dist.scatter( + out_tensor, scatter_list if i == my_rank else [], src=i, async_op=True + ) + gather_list.append(out_tensor) + req_list.append(req) + myreq.req = req_list + myreq.tensor = tuple(gather_list) + myreq.a2a_info = a2a_info + ctx.a2a_info = a2a_info + return myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + for r in myreq.req: + r.wait() + myreq.req = None + grad_input = myreq.tensor + grad_inputs = grad_input.split(ctx.a2a_info.emb_dim, dim=1) + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_Scatter_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + ctx.a2a_info = myreq.a2a_info + for r in myreq.req: + r.wait() + myreq.req = None + myreq.tensor = None + return output + + @staticmethod + def backward(ctx, *grad_output): + global myreq + assert len(grad_output) == my_size + scatter_list = [t.contiguous() for t in grad_output] + a2a_info = ctx.a2a_info + batch_split_lengths = ( + a2a_info.global_batch_partition_slices + if a2a_info.global_batch_partition_slices + else a2a_info.local_batch_num + ) + table_split_lengths = ( + a2a_info.global_table_wise_parition_slices + if a2a_info.global_table_wise_parition_slices + else [a2a_info.local_table_num] * my_size + ) + grad_input = grad_output[0].new_empty( + [a2a_info.batch_size, a2a_info.emb_dim * a2a_info.local_table_num] + ) + gather_list = list(grad_input.split(batch_split_lengths, dim=0)) + req_list = [] + for i in range(my_size): + req = dist.gather( + scatter_list[i], + gather_list if i == my_rank else [], + dst=i, + async_op=True, + ) + req_list.append(req) + myreq.req = req_list + myreq.tensor = grad_input + return grad_output + + +class All2All_Req(Function): + @staticmethod + def forward(ctx, a2a_info, *inputs): + global myreq + with record_function("DLRM alltoall_req_fwd_single"): + batch_split_lengths = a2a_info.global_batch_partition_slices + if batch_split_lengths: + batch_split_lengths = [ + m * a2a_info.emb_dim * a2a_info.local_table_num + for m in batch_split_lengths + ] + table_split_lengths = a2a_info.global_table_wise_parition_slices + if table_split_lengths: + table_split_lengths = [ + a2a_info.local_batch_num * e * a2a_info.emb_dim + for e in table_split_lengths + ] + input = torch.cat(inputs, dim=1).view([-1]) + output = input.new_empty( + [ + a2a_info.global_table_num + * a2a_info.local_batch_num + * a2a_info.emb_dim + ] + ) + req = dist.all_to_all_single( + output, input, table_split_lengths, batch_split_lengths, async_op=True + ) + + myreq.req = req + myreq.tensor = [] + myreq.tensor.append(output) + myreq.tensor = tuple(myreq.tensor) + 
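# A minimal sketch of the cat/split round trip these autograd Functions rely on:
# per-table embedding outputs of shape [local_batch_num, emb_dim] are concatenated along
# dim=1 before communication and split back into per-table chunks afterwards. The sizes
# below are hypothetical.
import torch

emb_dim, local_batch_num, local_table_num = 4, 3, 2
tables = [torch.randn(local_batch_num, emb_dim) for _ in range(local_table_num)]

packed = torch.cat(tables, dim=1)        # shape [3, 8]
unpacked = packed.split(emb_dim, dim=1)  # back to two [3, 4] chunks
assert all(torch.equal(a, b) for a, b in zip(tables, unpacked))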
a2a_info.batch_split_lengths = batch_split_lengths + a2a_info.table_split_lengths = table_split_lengths + myreq.a2a_info = a2a_info + ctx.a2a_info = a2a_info + return myreq.tensor + + @staticmethod + def backward(ctx, *grad_output): + global myreq + with record_function("DLRM alltoall_req_bwd_single"): + a2a_info = ctx.a2a_info + myreq.req.wait() + myreq.req = None + grad_input = myreq.tensor + grad_inputs = grad_input.view([a2a_info.batch_size, -1]).split( + a2a_info.emb_dim, dim=1 + ) + grad_inputs = [gin.contiguous() for gin in grad_inputs] + myreq.tensor = None + return (None, *grad_inputs) + + +class All2All_Wait(Function): + @staticmethod + def forward(ctx, *output): + global myreq + with record_function("DLRM alltoall_wait_fwd_single"): + a2a_info = myreq.a2a_info + ctx.a2a_info = a2a_info + myreq.req.wait() + myreq.req = None + myreq.tensor = None + table_split_lengths = ( + a2a_info.table_split_lengths + if a2a_info.table_split_lengths + else a2a_info.local_table_num + * a2a_info.local_batch_num + * a2a_info.emb_dim + ) + outputs = output[0].split(table_split_lengths) + outputs = tuple( + [out.view([a2a_info.local_batch_num, -1]) for out in outputs] + ) + return outputs + + @staticmethod + def backward(ctx, *grad_outputs): + global myreq + with record_function("DLRM alltoall_wait_bwd_single"): + a2a_info = ctx.a2a_info + grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs] + grad_output = torch.cat(grad_outputs) + grad_input = grad_output.new_empty( + [a2a_info.batch_size * a2a_info.local_table_num * a2a_info.emb_dim] + ) + req = dist.all_to_all_single( + grad_input, + grad_output, + a2a_info.batch_split_lengths, + a2a_info.table_split_lengths, + async_op=True, + ) + myreq.req = req + myreq.tensor = grad_input + return (grad_output,) + + +class AllGather(Function): + @staticmethod + def forward(ctx, input, global_lengths, dim=0): + if not isinstance(global_lengths, (list, tuple)): + global_lengths = [global_lengths] * my_size + + assert len(global_lengths) == my_size + assert global_lengths[my_rank] == input.size(dim) + local_start = sum(global_lengths[:my_rank]) + + output_size = list(input.size()) + + ctx.dim = dim + ctx.local_start = local_start + ctx.local_length = global_lengths[my_rank] + + input = input.contiguous() + if dim == 0: + out_len = sum(global_lengths) + output_size[dim] = out_len + output = input.new_empty(output_size) + gather_list = list(output.split(global_lengths, dim=0)) + else: + gather_list = [torch.empty_like(input) for _ in range(my_size)] + gather_list = [] + for length in global_lengths: + output_size[dim] = length + gather_list.append(input.new_empty(output_size)) + + dist.all_gather(gather_list, input) + + if dim != 0: + output = torch.cat(gather_list, dim=dim) + + return output + + @staticmethod + def backward(ctx, grad_output): + # print("Inside All2AllBackward") + dim = ctx.dim + start = ctx.local_start + length = ctx.local_length + + grad_input = grad_output.narrow(dim, start, length) + + return (grad_input, None, None) + + +class All2AllInfo(object): + pass + + +def alltoall(inputs, per_rank_table_splits): + global myreq + batch_size, emb_dim = inputs[0].size() + a2a_info = All2AllInfo() + a2a_info.local_table_num = len(inputs) + a2a_info.global_table_wise_parition_slices = per_rank_table_splits + ( + a2a_info.local_batch_num, + a2a_info.global_batch_partition_slices, + ) = get_split_lengths(batch_size) + a2a_info.emb_dim = emb_dim + a2a_info.batch_size = batch_size + a2a_info.global_table_num = ( + sum(per_rank_table_splits) + 
if per_rank_table_splits + else a2a_info.local_table_num * my_size + ) + + if a2a_impl == "" and alltoall_supported or a2a_impl == "alltoall": + # print("Using All2All_Req") + output = All2All_Req.apply(a2a_info, *inputs) + myreq.WaitFunction = All2All_Wait + elif a2a_impl == "" or a2a_impl == "scatter": + # print("Using All2All_Scatter_Req") + output = All2All_Scatter_Req.apply(a2a_info, *inputs) + myreq.WaitFunction = All2All_Scatter_Wait + elif a2a_impl == "scatter_list": + # print("Using All2All_ScatterList_Req") + output = All2All_ScatterList_Req.apply(a2a_info, *inputs) + myreq.WaitFunction = All2All_ScatterList_Wait + else: + print( + "Unknown value set for DLRM_ALLTOALL_IMPL (%s), " + "please use one of [alltoall, scatter, scatter_list]" % a2a_impl + ) + return myreq + + +def all_gather(input, lengths, dim=0): + if not lengths: + lengths = [input.size(0)] * my_size + return AllGather.apply(input, lengths, dim) + + +def barrier(): + if my_size > 1: + dist.barrier() + + +# Override builtin print function to print only from rank 0 +orig_print = builtins.print + + +def rank0_print(*args, **kwargs): + if my_rank <= 0 or kwargs.get("print_all", False): + orig_print(*args, **kwargs) + + +builtins.print = rank0_print + +# Allow printing from all rank with explicit print_all +def print_all(*args, **kwargs): + orig_print(*args, **kwargs) diff --git a/benchmarks/dlrm/ootb/input/dist_emb_0.log b/benchmarks/dlrm/ootb/input/dist_emb_0.log new file mode 100644 index 0000000..7a8c1b7 --- /dev/null +++ b/benchmarks/dlrm/ootb/input/dist_emb_0.log @@ -0,0 +1,3 @@ +1, 2, 3, 4, 5, 6 +0, 1, 3, 4, 5 +0.55, 0.64, 0.82, 0.91, 1.0 diff --git a/benchmarks/dlrm/ootb/input/dist_emb_1.log b/benchmarks/dlrm/ootb/input/dist_emb_1.log new file mode 100644 index 0000000..7a8c1b7 --- /dev/null +++ b/benchmarks/dlrm/ootb/input/dist_emb_1.log @@ -0,0 +1,3 @@ +1, 2, 3, 4, 5, 6 +0, 1, 3, 4, 5 +0.55, 0.64, 0.82, 0.91, 1.0 diff --git a/benchmarks/dlrm/ootb/input/dist_emb_2.log b/benchmarks/dlrm/ootb/input/dist_emb_2.log new file mode 100644 index 0000000..7a8c1b7 --- /dev/null +++ b/benchmarks/dlrm/ootb/input/dist_emb_2.log @@ -0,0 +1,3 @@ +1, 2, 3, 4, 5, 6 +0, 1, 3, 4, 5 +0.55, 0.64, 0.82, 0.91, 1.0 diff --git a/benchmarks/dlrm/ootb/input/trace.log b/benchmarks/dlrm/ootb/input/trace.log new file mode 100644 index 0000000..4d33e55 --- /dev/null +++ b/benchmarks/dlrm/ootb/input/trace.log @@ -0,0 +1 @@ +1, 2, 3, 4, 5, 3, 4, 1, 1, 6, 3 diff --git a/benchmarks/dlrm/ootb/kaggle_dac_loss_accuracy_plots.png b/benchmarks/dlrm/ootb/kaggle_dac_loss_accuracy_plots.png new file mode 100644 index 0000000..aaa51f3 Binary files /dev/null and b/benchmarks/dlrm/ootb/kaggle_dac_loss_accuracy_plots.png differ diff --git a/benchmarks/dlrm/ootb/mlperf_logger.py b/benchmarks/dlrm/ootb/mlperf_logger.py new file mode 100644 index 0000000..efce1d3 --- /dev/null +++ b/benchmarks/dlrm/ootb/mlperf_logger.py @@ -0,0 +1,118 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
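# A minimal sketch of the rank-0-only print override installed at the bottom of
# extend_distributed.py; here kwargs.pop() strips the extra flag before it reaches the
# built-in print. my_rank below is hypothetical (normally set by init_distributed()).
import builtins

my_rank = 1
orig_print = builtins.print

def rank0_print(*args, **kwargs):
    if my_rank <= 0 or kwargs.pop("print_all", False):
        orig_print(*args, **kwargs)

builtins.print = rank0_print
print("only rank 0 sees this")                 # suppressed here because my_rank == 1
print("every rank sees this", print_all=True)  # printed on all ranks
builtins.print = orig_print                    # restore for the rest of this sketch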
+ + +""" +Utilities for MLPerf logging +""" +import os +import torch + +try: + from mlperf_logging import mllog + from mlperf_logging.mllog import constants + _MLLOGGER = mllog.get_mllogger() +except ImportError as error: + print("Unable to import mlperf_logging, ", error) + + +def log_start(*args, **kwargs): + "log with start tag" + _log_print(_MLLOGGER.start, *args, **kwargs) + + +def log_end(*args, **kwargs): + "log with end tag" + _log_print(_MLLOGGER.end, *args, **kwargs) + + +def log_event(*args, **kwargs): + "log with event tag" + _log_print(_MLLOGGER.event, *args, **kwargs) + + +def _log_print(logger, *args, **kwargs): + "makes mlperf logger aware of distributed execution" + if 'stack_offset' not in kwargs: + kwargs['stack_offset'] = 3 + if 'value' not in kwargs: + kwargs['value'] = None + + if kwargs.pop('log_all_ranks', False): + log = True + else: + log = (get_rank() == 0) + + if log: + logger(*args, **kwargs) + + +def config_logger(benchmark): + "initiates mlperf logger" + mllog.config(filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log')) + _MLLOGGER.logger.propagate = False + + +def barrier(): + """ + Works as a temporary distributed barrier, currently pytorch + doesn't implement barrier for NCCL backend. + Calls all_reduce on dummy tensor and synchronizes with GPU. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) + torch.cuda.synchronize() + + +def get_rank(): + """ + Gets distributed rank or returns zero if distributed is not initialized. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + else: + rank = 0 + return rank + + +def mlperf_submission_log(benchmark): + """ + Logs information needed for MLPerf submission + """ + + config_logger(benchmark) + + log_event( + key=constants.SUBMISSION_BENCHMARK, + value=benchmark, + ) + + log_event( + key=constants.SUBMISSION_ORG, + value='reference_implementation') + + log_event( + key=constants.SUBMISSION_DIVISION, + value='closed') + + log_event( + key=constants.SUBMISSION_STATUS, + value='onprem') + + log_event( + key=constants.SUBMISSION_PLATFORM, + value='reference_implementation') + + log_event( + key=constants.SUBMISSION_ENTRY, + value="reference_implementation") + + log_event( + key=constants.SUBMISSION_POC_NAME, + value='reference_implementation') + + log_event( + key=constants.SUBMISSION_POC_EMAIL, + value='reference_implementation') diff --git a/benchmarks/dlrm/ootb/optim/rwsadagrad.py b/benchmarks/dlrm/ootb/optim/rwsadagrad.py new file mode 100644 index 0000000..95381ec --- /dev/null +++ b/benchmarks/dlrm/ootb/optim/rwsadagrad.py @@ -0,0 +1,122 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import torch +from torch.optim import Optimizer + + +class RWSAdagrad(Optimizer): + """Implements Row Wise Sparse Adagrad algorithm. 
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-2) + lr_decay (float, optional): learning rate decay (default: 0) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-10) + + """ + + def __init__(self, params, lr=1e-2, lr_decay=0.0, weight_decay=0.0, initial_accumulator_value=0.0, eps=1e-10): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= lr_decay: + raise ValueError("Invalid lr_decay value: {}".format(lr_decay)) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + if not 0.0 <= initial_accumulator_value: + raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + + self.defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay, + initial_accumulator_value=initial_accumulator_value) + super(RWSAdagrad, self).__init__(params, self.defaults) + + self.momentum_initialized = False + + for group in self.param_groups: + for p in group['params']: + self.state[p]['step'] = 0 + + def share_memory(self): + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if p.grad.data.is_sparse: + state['momentum'].share_memory_() + else: + state['sum'].share_memory_() + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + if not self.momentum_initialized : + if p.grad.data.is_sparse: + self.state[p]['momentum'] = torch.full( + [p.data.shape[0]], + self.defaults["initial_accumulator_value"], + dtype=torch.float32, + ) + else: + self.state[p]['sum'] = torch.full_like(p.data, + self.defaults["initial_accumulator_value"], + dtype=torch.float32, + ) + + grad = p.grad + state = self.state[p] + + state['step'] += 1 + + if group['weight_decay'] != 0: + if p.grad.data.is_sparse: + raise RuntimeError("weight_decay option is not compatible with sparse gradients") + grad = grad.add(group['weight_decay'], p.data) + + clr = group['lr'] / (1.0 + (state['step'] - 1.0) * group['lr_decay']) + + if grad.is_sparse: + grad = grad.coalesce() # the update is non-linear so indices must be unique + grad_indices = grad._indices() + grad_values = grad._values() + size = grad.size() + + def make_sparse(values, row_wise): + constructor = grad.new + matrix_size = [size[0]] if row_wise else size + return constructor(grad_indices, values, matrix_size) + + if grad_values.numel() > 0: + momentum_update = make_sparse(grad_values.pow(2).mean(dim=1), True) + state['momentum'].add_(momentum_update) # update momentum + std = state['momentum'].sparse_mask(momentum_update.coalesce()) + std_values = std._values().sqrt_().add_(group['eps']) + p.data.add_(make_sparse(grad_values / std_values.view(std_values.size()[0], 1), False), alpha=-clr) + + else: + state['sum'].addcmul_(grad, grad, value=1.0) + std = state['sum'].sqrt().add_(group['eps']) + p.data.addcdiv_(grad, std, value=-clr) + + self.momentum_initialized = True + + return loss diff --git 
a/benchmarks/dlrm/ootb/requirements.txt b/benchmarks/dlrm/ootb/requirements.txt new file mode 100644 index 0000000..b198a12 --- /dev/null +++ b/benchmarks/dlrm/ootb/requirements.txt @@ -0,0 +1,8 @@ +future +numpy +onnx +pydot +torch +torchviz +scikit-learn +tqdm diff --git a/benchmarks/dlrm/ootb/test/dlrm_s_test.sh b/benchmarks/dlrm/ootb/test/dlrm_s_test.sh new file mode 100755 index 0000000..e504545 --- /dev/null +++ b/benchmarks/dlrm/ootb/test/dlrm_s_test.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +#WARNING: must have compiled PyTorch and caffe2 + +#check if extra argument is passed to the test +if [[ $# == 1 ]]; then + dlrm_extra_option=$1 +else + dlrm_extra_option="" +fi +#echo $dlrm_extra_option + +dlrm_py="python dlrm_s_pytorch.py" +dlrm_c2="python dlrm_s_caffe2.py" + +echo "Running commands ..." +#run pytorch +echo $dlrm_py +$dlrm_py --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp1 +$dlrm_py --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp2 +$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp3 +$dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp4 + +#run caffe2 +echo $dlrm_c2 +$dlrm_c2 --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc1 +$dlrm_c2 --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc2 +$dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc3 +$dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc4 + +echo "Checking results ..." +#check results +#WARNING: correct test will have no difference in numeric values +#(but might have some verbal difference, e.g. due to warnnings) +#in the output file +echo "diff test1 (no numeric values in the output = SUCCESS)" +diff ccc1 ppp1 +echo "diff test2 (no numeric values in the output = SUCCESS)" +diff ccc2 ppp2 +echo "diff test3 (no numeric values in the output = SUCCESS)" +diff ccc3 ppp3 +echo "diff test4 (no numeric values in the output = SUCCESS)" +diff ccc4 ppp4 diff --git a/benchmarks/dlrm/ootb/test/dlrm_s_test_fbgemm_gpu.sh b/benchmarks/dlrm/ootb/test/dlrm_s_test_fbgemm_gpu.sh new file mode 100644 index 0000000..c699043 --- /dev/null +++ b/benchmarks/dlrm/ootb/test/dlrm_s_test_fbgemm_gpu.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +#WARNING: must have fbgemm_gpu module to run these tests. 
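#
# The consistency checks below run the same DLRM configuration twice -- once with the
# stock PyTorch embedding ops (ground truth) and once with --use-fbgemm-gpu -- on CPU and
# on GPU, with and without weighted pooling, plus an 8-bit quantized inference-only pass,
# and then diff the debug-mode outputs. An empty diff means the fbgemm_gpu path
# reproduces the reference numerics.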
+ +echo -e "\nConsistency test: fbgemm_gpu -compared-with- PyTorch emb ops" +dlrm_base_config_="python dlrm_s_pytorch.py --arch-sparse-feature-size=172 --arch-mlp-bot=1559-2500-2500-172 --arch-mlp-top=2000-2000-2000-1 --arch-embedding-size=213728-213728-213728-213728-213728-213728-213728-213728 --mini-batch-size=64 --num-indices-per-lookup-fixed=1 --num-indices-per-lookup=16 --num-batches=1 --nepochs=3 --debug-mode" + +for weighted_pooling in '' ' --weighted-pooling=fixed' ' --weighted-pooling=learned'; +do + dlrm_base_config=$dlrm_base_config_$weighted_pooling + + echo -e "\n======================================================" + echo "Testing 32-bit embeddings" + + dlrm_config="$dlrm_base_config" + echo "---GROUND TRUTH--- using PyTorch emb ops on CPU" + echo "$dlrm_config" + $dlrm_config > aaa1 + echo "---COMPARISON--- using fbgemm_gpu on CPU" + echo "$dlrm_config --use-fbgemm-gpu" + $dlrm_config --use-fbgemm-gpu > aaa2 + echo "diff GT & COMP (no numeric values in the output = SUCCESS)" + diff aaa1 aaa2 + + echo "---GROUND TRUTH--- using PyTorch emb ops on GPU" + echo "$dlrm_config --use-gpu" + $dlrm_config --use-gpu > bbb1 + echo "---COMPARISON--- using fbgemm_gpu on GPU" + echo "$dlrm_config --use-gpu --use-fbgemm-gpu" + $dlrm_config --use-fbgemm-gpu --use-gpu > bbb2 + echo "diff GT & COMP (no numeric values in the output = SUCCESS)" + diff bbb1 bbb2 + + echo -e "\n======================================================" + echo "Testing 8-bit quantized embeddings, inference only" + dlrm_config="$dlrm_base_config --inference-only --quantize-emb-with-bit=8" + + echo "---GROUND TRUTH--- using PyTorch emb ops on CPU" + echo "$dlrm_config" + $dlrm_config > ccc1 + + echo "---COMPARISON--- using fbgemm_gpu on CPU" + echo "$dlrm_config --use-fbgemm-gpu" + $dlrm_config --use-fbgemm-gpu > ccc2 + echo "diff GT & COMP (no numeric values in the output = SUCCESS)" + diff ccc1 ccc2 +done diff --git a/benchmarks/dlrm/ootb/tools/visualize.py b/benchmarks/dlrm/ootb/tools/visualize.py new file mode 100755 index 0000000..f16504c --- /dev/null +++ b/benchmarks/dlrm/ootb/tools/visualize.py @@ -0,0 +1,1030 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# +# This script performs the visualization of the embedding tables created in +# DLRM during the training procedure. We use two popular techniques for +# visualization: umap (https://umap-learn.readthedocs.io/en/latest/) and +# tsne (https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html). +# These links also provide instructions on how to install these packages +# in different environments. +# +# Warning: the size of the data to be visualized depends on the RAM on your machine. 
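# A minimal sketch (random data; headless matplotlib backend assumed) of the core step
# this script performs for a single embedding table: project the table's rows to 2-D with
# UMAP and scatter-plot them. The random matrix stands in for emb_l[k].weight loaded from
# --load-model.
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import umap

E_example = np.random.randn(1000, 16).astype(np.float32)  # hypothetical 1000 x 16 table
Y_example = umap.UMAP(random_state=42, metric="euclidean").fit_transform(E_example)

plt.figure(figsize=(8, 8))
plt.scatter(Y_example[:, 0], Y_example[:, 1], s=1, marker=".")
plt.title("UMAP projection of one embedding table")
plt.savefig("cat-example-umap.png")
plt.close()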
+# +# +# Connand line examples: +# +# Full analysis of embeddings and data representations for Criteo Kaggle data: +# $python ./tools/visualize.py --data-set=kaggle --load-model=../dlrm-2020-05-25/criteo.pytorch-e-0-i-110591 +# --raw-data-file=../../criteo/input/train.txt --skip-categorical-analysis +# --processed-data-file=../../criteo/input/kaggleAdDisplayChallenge_processed.npz +# +# +# To run just the analysis of categoricala data for Criteo Kaggle data set: +# $python ./tools/visualize.py --data-set=kaggle --load-model=../dlrm-2020-05-25/criteo.pytorch-e-0-i-110591 \ +# --raw-data-file=../../criteo/input/train.txt --data-randomize=none --processed-data-file=../../criteo/input/kaggleAdDisplayChallenge_processed.npz \ +# --skip-embedding --skip-data-plots +# +# +# The following command line arguments are available to the user: +# +# --load-model - DLRM model file +# --data-set - one of ["kaggle", "terabyte"] +# --max-ind-range - max index range used during the traning +# --output-dir - output directory, if not specified, it will be traeted from the model and datset names +# --max-umap-size - max number of points to visualize using UMAP, default=50000 +# --use-tsne - use T-SNE +# --max-tsne-size - max number of points to visualize using T-SNE, default=1000) +# --skip-embedding - skips analysis of embedding tables +# --umap-metric - metric for UMAP +# --skip-data-plots - skips data plots +# --skip-categorical-analysis - skips categorical analysis +# +# # data file related +# --raw-data-file +# --processed-data-file +# --data-sub-sample-rate +# --data-randomize +# --memory-map +# --mini-batch-size +# --num-workers +# --test-mini-batch-size +# --test-num-workers +# --num-batches +# --mlperf-logging + +import os +import sys +import argparse +import numpy as np +import umap +import hdbscan +import json +import torch +import math +import matplotlib +import matplotlib.pyplot as plt +import collections + +from sklearn.metrics import accuracy_score +from sklearn.metrics import f1_score +from sklearn.metrics import precision_score +from sklearn.metrics import recall_score + +from sklearn import manifold + +import dlrm_data_pytorch as dp +from dlrm_s_pytorch import DLRM_Net + + +def visualize_embeddings_umap(emb_l, + output_dir = "", + max_size = 500000, + umap_metric = "euclidean", + cat_counts = None, + use_max_count = True): + + for k in range(0, len(emb_l)): + + E = emb_l[k].weight.detach().cpu().numpy() + print("umap", E.shape) + + # create histogram of norms + bins = 50 + norms = [np.linalg.norm(E[i], ord=2) for i in range(0,E.shape[0])] +# plt.hist(norms, bins = bins) +# plt.title("Cat norm hist var. 
"+str(k)) + hist, bins = np.histogram(norms, bins=bins) + logbins = np.logspace(np.log10(bins[0]),np.log10(bins[-1]),len(bins)) + + plt.figure(figsize=(8,8)) + plt.title("Categorical norms: " + str(k) + " cardinality " + str(len(cat_counts[k]))) + plt.hist(norms, bins=logbins) + plt.xscale("log") +# plt.legend() + plt.savefig(output_dir+"/cat-norm-histogram-"+str(k)+".png") + plt.close() + + if E.shape[0] < 20: + print("Skipping small embedding") + continue + + n_vis = min(max_size, E.shape[0]) + min_cnt = 0 + +# reducer = umap.UMAP(random_state=42, n_neighbors=25, min_dist=0.1) + reducer = umap.UMAP(random_state=42, metric=umap_metric) + + if use_max_count is False or n_vis == E.shape[0]: + Y = reducer.fit_transform(E[:n_vis,:]) + else: + + # select values with couns > 1 + done = False + min_cnt = 1 + while done == False: + el_cnt = (cat_counts[k] > min_cnt).sum() + if el_cnt <= max_size: + done = True + else: + min_cnt = min_cnt+1 + + E1= [] + for i in range(0, E.shape[0]): + if cat_counts[k][i] > min_cnt: + E1.append(E[i,:]) + + print("max_count_len", len(E1), "mincount", min_cnt) + Y = reducer.fit_transform(np.array(E1)) + + n_vis = len(E1) + + plt.figure(figsize=(8,8)) + + linewidth = 0 + size = 1 + + if Y.shape[0] < 2500: + linewidth = 1 + size = 5 + + if cat_counts is None: + plt.scatter(-Y[:,0], -Y[:,1], s=size, marker=".", linewidth=linewidth) + else: + #print(cat_counts[k]) + n_disp = min(len(cat_counts[k]), Y.shape[0]) + cur_max = math.log(max(cat_counts[k])) + norm_cat_count = [math.log(cat_counts[k][i]+1)/cur_max for i in range(0, len(cat_counts[k]))] + plt.scatter(-Y[0:n_disp,0], -Y[0:n_disp,1], s=size, marker=".", linewidth=linewidth, c=np.array(norm_cat_count)[0:n_disp], cmap="viridis") + plt.colorbar() + + plt.title("UMAP: categorical var. " + str(k) + " (" + str(n_vis) + " of " + str(E.shape[0]) + ", min count " + str(min_cnt) + ")") + plt.savefig(output_dir + "/cat-" + str(k) + "-" + str(n_vis) + "-of-" + str(E.shape[0]) + "-umap.png") + plt.close() + + +def visualize_embeddings_tsne(emb_l, + output_dir = "", + max_size = 10000): + + for k in range(0, len(emb_l)): + + E = emb_l[k].weight.detach().cpu() + print("tsne", E.shape) + + if E.shape[0] < 20: + print("Skipping small embedding") + continue + + n_vis = min(max_size, E.shape[0]) + + tsne = manifold.TSNE(init="pca", random_state=0, method="exact") + + Y = tsne.fit_transform(E[:n_vis,:]) + + plt.figure(figsize=(8, 8)) + + linewidth = 0 + if Y.shape[0] < 5000: + linewidth = 1 + + plt.scatter(-Y[:,0], -Y[:,1], s=1, marker=".", linewidth=linewidth) + + plt.title("TSNE: categorical var. 
" + str(k) + " (" + str(n_vis) + " of " + str(E.shape[0]) + ")") + plt.savefig(output_dir + "/cat-" + str(k) + "-" + str(n_vis) + "-of-" + str(E.shape[0]) + "-tsne.png") + plt.close() + + +def analyse_categorical_data(X_cat, n_days=10, output_dir=""): + + # analyse categorical variables + n_vec = len(X_cat) + n_cat = len(X_cat[0]) + n_days = n_days + + print("n_vec", n_vec, "n_cat", n_cat) +# for c in train_data.X_cat: +# print(n_cat, c) + + all_cat = np.array(X_cat) + print("all_cat.shape", all_cat.shape) + day_size = all_cat.shape[0]/n_days + + for i in range(0,n_cat): + l_d = [] + l_s1 = [] + l_s2 = [] + l_int = [] + l_rem = [] + + cat = all_cat[:,i] + print("cat", i, cat.shape) + for d in range(1,n_days): + offset = int(d*day_size) + #print(offset) + cat1 = cat[:offset] + cat2 = cat[offset:] + + s1 = set(cat1) + s2 = set(cat2) + + intersect = list(s1 & s2) + #print(intersect) + l_d.append(d) + l_s1.append(len(s1)) + l_s2.append(len(s2)) + l_int.append(len(intersect)) + l_rem.append((len(s1)-len(intersect))) + + print(d, ",", len(s1), ",", len(s2), ",", len(intersect), ",", (len(s1)-len(intersect))) + + print("spit", l_d) + print("before", l_s1) + print("after", l_s2) + print("inters.", l_int) + print("removed", l_rem) + + plt.figure(figsize=(8,8)) + plt.plot(l_d, l_s1, "g", label="before") + plt.plot(l_d, l_s2, "r", label="after") + plt.plot(l_d, l_int, "b", label="intersect") + plt.plot(l_d, l_rem, "y", label="removed") + plt.title("categorical var. "+str(i)) + plt.legend() + plt.savefig(output_dir+"/cat-"+str(i).zfill(3)+".png") + plt.close() + + +def analyse_categorical_counts(X_cat, emb_l=None, output_dir=""): + + # analyse categorical variables + n_vec = len(X_cat) + n_cat = len(X_cat[0]) + + print("n_vec", n_vec, "n_cat", n_cat) +# for c in train_data.X_cat: +# print(n_cat, c) + + all_cat = np.array(X_cat) + print("all_cat.shape", all_cat.shape) + + all_counts = [] + + for i in range(0,n_cat): + + cat = all_cat[:,i] + if emb_l is None: + s = set(cat) + counts = np.zeros((len(s))) + print("cat", i, cat.shape, len(s)) + else: + s = emb_l[i].weight.detach().cpu().shape[0] + counts = np.zeros((s)) + print("cat", i, cat.shape, s) + + for d in range(0,n_vec): + cv = int(cat[d]) + counts[cv] = counts[cv]+1 + + all_counts.append(counts) + + if emb_l is None: + plt.figure(figsize=(8,8)) + plt.plot(counts) + plt.title("Categorical var "+str(i) + " cardinality " + str(len(counts))) + # plt.legend() + else: + E = emb_l[i].weight.detach().cpu().numpy() + norms = [np.linalg.norm(E[i], ord=2) for i in range(0,E.shape[0])] + + fig, (ax0, ax1) = plt.subplots(2, 1) + fig.suptitle("Categorical variable: " + str(i)+" cardinality "+str(len(counts))) + + ax0.plot(counts) + ax0.set_yscale("log") + ax0.set_title("Counts", fontsize=10) + + ax1.plot(norms) + ax1.set_title("Norms", fontsize=10) + + plt.savefig(output_dir+"/cat_counts-"+str(i).zfill(3)+".png") + plt.close() + + return all_counts + + +def dlrm_output_wrap(dlrm, X, lS_o, lS_i, T): + + all_feat_vec = [] + all_cat_vec = [] + x_vec = None + t_out = None + c_out = None + z_out = [] + p_out = None + + z_size = len(dlrm.top_l) + + x = dlrm.apply_mlp(X, dlrm.bot_l) + # debug prints + #print("intermediate") + #print(x[0].detach().cpu().numpy()) + x_vec = x[0].detach().cpu().numpy() + all_feat_vec.append(x_vec) +# all_X.append(x[0].detach().cpu().numpy()) + + # process sparse features(using embeddings), resulting in a list of row vectors + ly = dlrm.apply_emb(lS_o, lS_i, dlrm.emb_l) + + for e in ly: + #print(e.detach().cpu().numpy()) + 
all_feat_vec.append(e[0].detach().cpu().numpy()) + all_cat_vec.append(e[0].detach().cpu().numpy()) + + all_feat_vec= np.concatenate(all_feat_vec, axis=0) + all_cat_vec= np.concatenate(all_cat_vec, axis=0) + +# all_features.append(all_feat_vec) +# all_cat.append(all_cat_vec) + t_out = int(T.detach().cpu().numpy()[0,0]) +# all_T.append(int(T.detach().cpu().numpy()[0,0])) + + z = dlrm.interact_features(x, ly) + # print(z.detach().cpu().numpy()) +# z_out = z.detach().cpu().numpy().flatten() + z_out.append(z.detach().cpu().numpy().flatten()) +# all_z[0].append(z.detach().cpu().numpy().flatten()) + + # obtain probability of a click (using top mlp) +# print(dlrm.top_l) +# p = dlrm.apply_mlp(z, dlrm.top_l) + + for i in range(0, z_size): + z = dlrm.top_l[i](z) + +# if i < z_size-1: +# curr_z = z.detach().cpu().numpy().flatten() + z_out.append(z.detach().cpu().numpy().flatten()) +# all_z[i+1].append(curr_z) +# print("z append", i) + +# print("z",i, z.detach().cpu().numpy().flatten().shape) + + p = z + + # clamp output if needed + if 0.0 < dlrm.loss_threshold and dlrm.loss_threshold < 1.0: + z = torch.clamp(p, min=dlrm.loss_threshold, max=(1.0 - dlrm.loss_threshold)) + else: + z = p + + class_thresh = 0.0 #-0.25 + zp = z.detach().cpu().numpy()[0,0]+ class_thresh + + p_out = int(zp+0.5) + if p_out > 1: + p_out = 1 + if p_out < 0: + p_out = 0 + +# all_pred.append(int(z.detach().cpu().numpy()[0,0]+0.5)) + + #print(int(z.detach().cpu().numpy()[0,0]+0.5)) + if int(p_out) == t_out: + c_out = 0 + else: + c_out = 1 + + return all_feat_vec, x_vec, all_cat_vec, t_out, c_out, z_out, p_out + + +def create_umap_data(dlrm, data_ld, max_size=50000, offset=0, info=""): + + all_features = [] + all_X = [] + all_cat = [] + all_T = [] + all_c = [] + all_z = [] + all_pred = [] + + z_size = len(dlrm.top_l) + print("z_size", z_size) + for i in range(0, z_size): + all_z.append([]) + + for j, (X, lS_o, lS_i, T) in enumerate(data_ld): + + if j < offset: + continue + + if j >= max_size+offset: + break + + af, x, cat, t, c, z, p = dlrm_output_wrap(dlrm, X, lS_o, lS_i, T) + + all_features.append(af) + all_X.append(x) + all_cat.append(cat) + all_T.append(t) + all_c.append(c) + all_pred.append(p) + + for i in range(0, z_size): + all_z[i].append(z[i]) + +# # calculate classifier metrics + ac = accuracy_score(all_T, all_pred) + f1 = f1_score(all_T, all_pred) + ps = precision_score(all_T, all_pred) + rc = recall_score(all_T, all_pred) + + print(info, "accuracy", ac, "f1", f1, "precision", ps, "recall", rc) + + return all_features, all_X, all_cat, all_T, all_z, all_c, all_pred + + +def plot_all_data_3(umap_Y, + umap_T, + train_Y = None, + train_T = None, + test_Y = None, + test_T = None, + total_train_size = "", + total_test_size = "", + info = "", + output_dir = "", + orig_space_dim = 0): + + size = 1 + colors = ["red","green"] + + fig, (ax0, ax1, ax2) = plt.subplots(1, 3) + fig.suptitle("UMAP: " + info + " space dim "+str(orig_space_dim)) + + ax0.scatter(umap_Y[:,0], umap_Y[:,1], s=size, c=umap_T, cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0) + ax0.set_title("UMAP ("+str(len(umap_T))+" of "+ total_train_size+")", fontsize=7) + + if train_Y is not None and train_T is not None: + ax1.scatter(train_Y[:,0], train_Y[:,1], s=size, c=train_T, cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0) + ax1.set_title("Train ("+str(len(train_T))+" of "+ total_train_size+")", fontsize=7) + + if test_Y is not None and test_T is not None: + ax2.scatter(test_Y[:,0], test_Y[:,1], s=size, c=test_T, 
cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0) + ax2.set_title("Test ("+str(len(test_T))+" of "+ total_test_size+")", fontsize=7) + + plt.savefig(output_dir+"/"+info+"-umap.png") + plt.close() + + +def plot_one_class_3(umap_Y, + umap_T, + train_Y, + train_T, + test_Y, + test_T, + target = 0, + col = "red", + total_train_size = "", + total_test_size = "", + info = "", + output_dir = "", + orig_space_dim = 0): + + size = 1 + + fig, (ax0, ax1, ax2) = plt.subplots(1, 3) + fig.suptitle("UMAP: "+ info + " space dim "+str(orig_space_dim)) + + ind_l_umap = [i for i,x in enumerate(umap_T) if x == target] + Y_umap_l = np.array([umap_Y[i,:] for i in ind_l_umap]) + + ax0.scatter(Y_umap_l[:,0], Y_umap_l[:,1], s=size, c=col, marker=".", linewidth=0) + ax0.set_title("UMAP, ("+str(len(umap_T))+" of "+ total_train_size+")", fontsize=7) + + if train_Y is not None and train_T is not None: + ind_l_test = [i for i,x in enumerate(train_T) if x == target] + Y_test_l = np.array([train_Y[i,:] for i in ind_l_test]) + + ax1.scatter(Y_test_l[:,0], Y_test_l[:,1], s=size, c=col, marker=".", linewidth=0) + ax1.set_title("Train, ("+str(len(train_T))+" of "+ total_train_size+")", fontsize=7) + + if test_Y is not None and test_T is not None: + ind_l_test = [i for i,x in enumerate(test_T) if x == target] + Y_test_l = np.array([test_Y[i,:] for i in ind_l_test]) + + ax2.scatter(Y_test_l[:,0], Y_test_l[:,1], s=size, c=col, marker=".", linewidth=0) + ax2.set_title("Test, ("+str(len(test_T))+" of "+ total_test_size+")", fontsize=7) + + plt.savefig(output_dir+"/"+info+"-umap.png") + plt.close() + + +def visualize_umap_data(umap_Y, + umap_T, + umap_C, + umap_P, + train_Y, + train_T, + train_C, + train_P, + test_Y = None, + test_T = None, + test_C = None, + test_P = None, + total_train_size = "", + total_test_size = "", + info = "", + output_dir = "", + orig_space_dim = 0): + + # all classes + plot_all_data_3(umap_Y = umap_Y, + umap_T = umap_T, + train_Y = train_Y, + train_T = train_T, + test_Y = test_Y, + test_T = test_T, + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info, + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # all predictions + plot_all_data_3(umap_Y = umap_Y, + umap_T = umap_P, + train_Y = train_Y, + train_T = train_P, + test_Y = test_Y, + test_T = test_P, + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info+", all-predictions", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + + # class 0 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_T, + train_Y = train_Y, + train_T = train_T, + test_Y = test_Y, + test_T = test_T, + target = 0, + col = "red", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info+" class " + str(0), + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # class 1 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_T, + train_Y = train_Y, + train_T = train_T, + test_Y = test_Y, + test_T = test_T, + target = 1, + col = "green", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " class " + str(1), + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # correct classification + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_C, + train_Y = train_Y, + train_T = train_C, + test_Y = test_Y, + test_T = test_C, + target = 0, + col = "green", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " correct ", + output_dir = 
output_dir, + orig_space_dim = orig_space_dim) + + # errors + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_C, + train_Y = train_Y, + train_T = train_C, + test_Y = test_Y, + test_T = test_C, + target = 1, + col = "red", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " errors ", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # prediction 0 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_P, + train_Y = train_Y, + train_T = train_P, + test_Y = test_Y, + test_T = test_P, + target = 0, + col = "red", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " predict-0 ", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # prediction 1 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_P, + train_Y = train_Y, + train_T = train_P, + test_Y = test_Y, + test_T = test_P, + target = 1, + col = "green", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " predict-1 ", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + +def hdbscan_clustering(umap_data, train_data, test_data, info="", output_dir=""): + + clusterer = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500, prediction_data=True) + umap_labels = clusterer.fit_predict(umap_data) + train_labels, _ = hdbscan.approximate_predict(clusterer, train_data) + test_labels, _ = hdbscan.approximate_predict(clusterer, test_data) + + fig, ((ax00, ax01, ax02), (ax10, ax11, ax12)) = plt.subplots(2, 3) + fig.suptitle("HDBSCAN clastering: "+ info ) + + # plot umap data + umap_clustered = (umap_labels >= 0) + umap_coll = collections.Counter(umap_clustered) + print("umap_clustered", umap_coll) +# print("umap_data", umap_data.shape) +# print("~umap_clustered", umap_clustered.count(False), ~umap_clustered) + ax00.scatter(umap_data[~umap_clustered, 0], + umap_data[~umap_clustered, 1], + c=(0.5, 0.5, 0.5), + s=0.1, + alpha=0.5) + ax00.set_title("UMAP Outliers " + str(umap_coll[False]), fontsize=7) + ax10.scatter(umap_data[umap_clustered, 0], + umap_data[umap_clustered, 1], + c=umap_labels[umap_clustered], + s=0.1, + cmap="Spectral") + ax10.set_title("UMAP Inliers " + str(umap_coll[True]), fontsize=7) + + # plot train data + train_clustered = (train_labels >= 0) + train_coll = collections.Counter(train_clustered) + ax01.scatter(train_data[~train_clustered, 0], + train_data[~train_clustered, 1], + c=(0.5, 0.5, 0.5), + s=0.1, + alpha=0.5) + ax01.set_title("Train Outliers " + str(train_coll[False]), fontsize=7) + ax11.scatter(train_data[train_clustered, 0], + train_data[train_clustered, 1], + c=train_labels[train_clustered], + s=0.1, + cmap="Spectral") + ax11.set_title("Train Inliers " + str(train_coll[True]), fontsize=7) + + # plot test data + test_clustered = (test_labels >= 0) + test_coll = collections.Counter(test_clustered) + ax02.scatter(test_data[~test_clustered, 0], + test_data[~test_clustered, 1], + c=(0.5, 0.5, 0.5), + s=0.1, + alpha=0.5) + ax02.set_title("Tets Outliers " + str(test_coll[False]), fontsize=7) + ax12.scatter(test_data[test_clustered, 0], + test_data[test_clustered, 1], + c=test_labels[test_clustered], + s=0.1, + cmap="Spectral") + ax12.set_title("Test Inliers " + str(test_coll[True]), fontsize=7) + + plt.savefig(output_dir+"/"+info+"-hdbscan.png") + plt.close() + + +def visualize_all_data_umap(dlrm, + train_ld, + test_ld = None, + max_umap_size = 50000, + output_dir = "", + umap_metric = "euclidean"): + + data_ratio = 1 + + print("creating umap data") + umap_train_feat, 
umap_train_X, umap_train_cat, umap_train_T, umap_train_z, umap_train_c, umap_train_p = create_umap_data(dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size, offset=0, info="umap") + + # transform train and test data + train_feat, train_X, train_cat, train_T, train_z, train_c, train_p = create_umap_data(dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size*data_ratio, offset=max_umap_size, info="train") + test_feat, test_X, test_cat, test_T, test_z, test_c, test_p = create_umap_data(dlrm=dlrm, data_ld=test_ld, max_size=max_umap_size*data_ratio, offset=0, info="test") + + print("umap_train_feat", np.array(umap_train_feat).shape) + reducer_all_feat = umap.UMAP(random_state=42, metric=umap_metric) + umap_feat_Y = reducer_all_feat.fit_transform(umap_train_feat) + + train_feat_Y = reducer_all_feat.transform(train_feat) + test_feat_Y = reducer_all_feat.transform(test_feat) + + visualize_umap_data(umap_Y = umap_feat_Y, + umap_T = umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_feat_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_feat_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "all-features", + output_dir = output_dir, + orig_space_dim = np.array(umap_train_feat).shape[1]) + + hdbscan_clustering(umap_data = umap_feat_Y, + train_data = train_feat_Y, + test_data = test_feat_Y, + info = "umap-all-features", + output_dir = output_dir) + +# hdbscan_clustering(umap_data = np.array(umap_train_feat), +# train_data = np.array(train_feat), +# test_data = np.array(test_feat), +# info = "all-features", +# output_dir = output_dir) + + print("umap_train_X", np.array(umap_train_X).shape) + reducer_X = umap.UMAP(random_state=42, metric=umap_metric) + umap_X_Y = reducer_X.fit_transform(umap_train_X) + + train_X_Y = reducer_X.transform(train_X) + test_X_Y = reducer_X.transform(test_X) + + visualize_umap_data(umap_Y = umap_X_Y, + umap_T = umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_X_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_X_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "cont-features", + output_dir = output_dir, + orig_space_dim = np.array(umap_train_X).shape[1]) + + print("umap_train_cat", np.array(umap_train_cat).shape) + reducer_cat = umap.UMAP(random_state=42, metric=umap_metric) + umap_cat_Y = reducer_cat.fit_transform(umap_train_cat) + + train_cat_Y = reducer_cat.transform(train_cat) + test_cat_Y = reducer_cat.transform(test_cat) + + visualize_umap_data(umap_Y = umap_cat_Y, + umap_T = umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_cat_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_cat_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "cat-features", + output_dir = output_dir, + orig_space_dim = np.array(umap_train_cat).shape[1]) + + # UMAP for z data + for i in range(0,len(umap_train_z)): + print("z", i, np.array(umap_train_z[i]).shape) + reducer_z = umap.UMAP(random_state=42, metric=umap_metric) + umap_z_Y = reducer_z.fit_transform(umap_train_z[i]) + + train_z_Y = reducer_z.transform(train_z[i]) + test_z_Y = reducer_z.transform(test_z[i]) + + visualize_umap_data(umap_Y = umap_z_Y, + umap_T = 
umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_z_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_z_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "z-features-"+str(i), + output_dir = output_dir, + orig_space_dim = np.array(umap_train_z[i]).shape[1]) + + +def analyze_model_data(output_dir, + dlrm, + train_ld, + test_ld, + train_data, + skip_embedding = False, + use_tsne = False, + max_umap_size = 50000, + max_tsne_size = 10000, + skip_categorical_analysis = False, + skip_data_plots = False, + umap_metric = "euclidean"): + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if skip_embedding is False: + + cat_counts = None + + cat_counts = analyse_categorical_counts(X_cat=train_data.X_cat, emb_l=dlrm.emb_l, output_dir=output_dir) + + visualize_embeddings_umap(emb_l = dlrm.emb_l, + output_dir = output_dir, + max_size = max_umap_size, + umap_metric = umap_metric, + cat_counts = cat_counts) + + if use_tsne is True: + visualize_embeddings_tsne(emb_l = dlrm.emb_l, + output_dir = output_dir, + max_size = max_tsne_size) + + # data visualization and analysis + if skip_data_plots is False: + visualize_all_data_umap(dlrm=dlrm, train_ld=train_ld, test_ld=test_ld, max_umap_size=max_umap_size, output_dir=output_dir, umap_metric=umap_metric) + + # analyse categorical variables + if skip_categorical_analysis is False and args.data_randomize == "none": + analyse_categorical_data(X_cat=train_data.X_cat, n_days=10, output_dir=output_dir) + + + +if __name__ == "__main__": + + output_dir = "" + + ### parse arguments ### + parser = argparse.ArgumentParser( + description="Exploratory DLRM analysis" + ) + + parser.add_argument("--load-model", type=str, default="") + parser.add_argument("--data-set", choices=["kaggle", "terabyte"], help="dataset") +# parser.add_argument("--dataset-path", required=True, help="path to the dataset") + parser.add_argument("--max-ind-range", type=int, default=-1) +# parser.add_argument("--mlperf-bin-loader", action="store_true", default=False) + parser.add_argument("--output-dir", type=str, default="") + parser.add_argument("--skip-embedding", action="store_true", default=False) + parser.add_argument("--umap-metric", type=str, default="euclidean") + parser.add_argument("--skip-data-plots", action="store_true", default=False) + parser.add_argument("--skip-categorical-analysis", action="store_true", default=False) + + # umap relatet + parser.add_argument("--max-umap-size", type=int, default=50000) + # tsne related + parser.add_argument("--use-tsne", action="store_true", default=False) + parser.add_argument("--max-tsne-size", type=int, default=1000) + # data file related + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--data-randomize", type=str, default="total") # none, total or day or none + parser.add_argument("--memory-map", action="store_true", default=False) + parser.add_argument("--mini-batch-size", type=int, default=1) + parser.add_argument("--num-workers", type=int, default=0) + parser.add_argument("--test-mini-batch-size", type=int, default=1) + parser.add_argument("--test-num-workers", type=int, default=0) + parser.add_argument("--num-batches", type=int, default=0) + # mlperf logging (disables other output 
and stops early)
+    parser.add_argument("--mlperf-logging", action="store_true", default=False)
+
+    args = parser.parse_args()
+
+    print("command line args: ", json.dumps(vars(args)))
+
+    output_dir = args.output_dir
+    if output_dir == "":
+        output_dir = args.data_set+"-"+os.path.split(args.load_model)[-1]+"-vis_all"
+    print("output_dir:", output_dir)
+
+    if args.data_set == "kaggle":
+        # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh)
+        m_spa=16
+        ln_emb=np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572])
+        ln_bot=np.array([13,512,256,64,16])
+        ln_top=np.array([367,512,256,1])
+    elif args.data_set == "terabyte":
+        if args.max_ind_range == 10000000:
+            # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-ind-range=10000000)
+            m_spa=64
+            ln_emb=np.array([9980333,36084,17217,7378,20134,3,7112,1442,61, 9758201,1333352,313829,10,2208,11156,122,4,970,14, 9994222, 7267859, 9946608,415421,12420,101, 36])
+            ln_bot=np.array([13,512,256,64])
+            ln_top=np.array([415,512,512,256,1])
+        elif args.max_ind_range == 40000000:
+            # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-ind-range=40000000)
+            m_spa=128
+            ln_emb=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36])
+            ln_bot=np.array([13,512,256,128])
+            ln_top=np.array([479,1024,1024,512,256,1])
+        else:
+            raise ValueError("only --max-ind-range 10M or 40M is supported")
+    else:
+        raise ValueError("only kaggle|terabyte dataset options are supported")
+
+    # check input parameters
+    if args.data_randomize != "none" and args.skip_categorical_analysis is not True:
+        print("Incorrect option for categorical analysis, use: --data-randomize=none")
+        sys.exit(-1)
+
+    dlrm = DLRM_Net(
+        m_spa,
+        ln_emb,
+        ln_bot,
+        ln_top,
+        arch_interaction_op="dot",
+        arch_interaction_itself=False,
+        sigmoid_bot=-1,
+        sigmoid_top=ln_top.size - 2,
+        sync_dense_params=True,
+        loss_threshold=0.0,
+        ndevices=-1,
+        qr_flag=False,
+        qr_operation=None,
+        qr_collisions=None,
+        qr_threshold=None,
+        md_flag=False,
+        md_threshold=None,
+    )
+
+    # Load model if specified
+    if not (args.load_model == ""):
+        print("Loading saved model {}".format(args.load_model))
+
+        ld_model = torch.load(args.load_model, map_location=torch.device("cpu"))
+        dlrm.load_state_dict(ld_model["state_dict"])
+
+        print("Model loaded", args.load_model)
+        #print(dlrm)
+
+        z_size = len(dlrm.top_l)
+        for i in range(0, z_size):
+            print("z", i, dlrm.top_l[i])
+
+    # load data
+    train_data = None
+    test_data = None
+
+    if args.raw_data_file != "" or args.processed_data_file != "":
+        train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args)
+
+    analyze_model_data(output_dir = output_dir,
+                       dlrm = dlrm,
+                       train_ld = train_ld,
+                       test_ld = test_ld,
+                       train_data = train_data,
+                       skip_embedding = args.skip_embedding,
+                       use_tsne = args.use_tsne,
+                       max_umap_size = args.max_umap_size,
+                       max_tsne_size = args.max_tsne_size,
+                       skip_categorical_analysis = args.skip_categorical_analysis,
+                       skip_data_plots = args.skip_data_plots,
+                       umap_metric = args.umap_metric)
+
diff --git a/benchmarks/dlrm/ootb/tricks/md_embedding_bag.py b/benchmarks/dlrm/ootb/tricks/md_embedding_bag.py
new file mode 100644 index 0000000..7c4071a
--- /dev/null
+++ b/benchmarks/dlrm/ootb/tricks/md_embedding_bag.py
@@ -0,0 +1,81 @@
+# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Mixed-Dimensions Trick +# +# Description: Applies mixed dimension trick to embeddings to reduce +# embedding sizes. +# +# References: +# [1] Antonio Ginart, Maxim Naumov, Dheevatsa Mudigere, Jiyan Yang, James Zou, +# "Mixed Dimension Embeddings with Application to Memory-Efficient Recommendation +# Systems", CoRR, arXiv:1909.11810, 2019 +from __future__ import absolute_import, division, print_function, unicode_literals +import torch +import torch.nn as nn + + +def md_solver(n, alpha, d0=None, B=None, round_dim=True, k=None): + ''' + An external facing function call for mixed-dimension assignment + with the alpha power temperature heuristic + Inputs: + n -- (torch.LongTensor) ; Vector of num of rows for each embedding matrix + alpha -- (torch.FloatTensor); Scalar, non-negative, controls dim. skew + d0 -- (torch.FloatTensor); Scalar, baseline embedding dimension + B -- (torch.FloatTensor); Scalar, parameter budget for embedding layer + round_dim -- (bool); flag for rounding dims to nearest pow of 2 + k -- (torch.LongTensor) ; Vector of average number of queries per inference + ''' + n, indices = torch.sort(n) + k = k[indices] if k is not None else torch.ones(len(n)) + d = alpha_power_rule(n.type(torch.float) / k, alpha, d0=d0, B=B) + if round_dim: + d = pow_2_round(d) + undo_sort = [0] * len(indices) + for i, v in enumerate(indices): + undo_sort[v] = i + return d[undo_sort] + + +def alpha_power_rule(n, alpha, d0=None, B=None): + if d0 is not None: + lamb = d0 * (n[0].type(torch.float) ** alpha) + elif B is not None: + lamb = B / torch.sum(n.type(torch.float) ** (1 - alpha)) + else: + raise ValueError("Must specify either d0 or B") + d = torch.ones(len(n)) * lamb * (n.type(torch.float) ** (-alpha)) + for i in range(len(d)): + if i == 0 and d0 is not None: + d[i] = d0 + else: + d[i] = 1 if d[i] < 1 else d[i] + return (torch.round(d).type(torch.long)) + + +def pow_2_round(dims): + return 2 ** torch.round(torch.log2(dims.type(torch.float))) + + +class PrEmbeddingBag(nn.Module): + def __init__(self, num_embeddings, embedding_dim, base_dim): + super(PrEmbeddingBag, self).__init__() + self.embs = nn.EmbeddingBag( + num_embeddings, embedding_dim, mode="sum", sparse=True) + torch.nn.init.xavier_uniform_(self.embs.weight) + if embedding_dim < base_dim: + self.proj = nn.Linear(embedding_dim, base_dim, bias=False) + torch.nn.init.xavier_uniform_(self.proj.weight) + elif embedding_dim == base_dim: + self.proj = nn.Identity() + else: + raise ValueError( + "Embedding dim " + str(embedding_dim) + " > base dim " + str(base_dim) + ) + + def forward(self, input, offsets=None, per_sample_weights=None): + return self.proj(self.embs( + input, offsets=offsets, per_sample_weights=per_sample_weights)) diff --git a/benchmarks/dlrm/ootb/tricks/qr_embedding_bag.py b/benchmarks/dlrm/ootb/tricks/qr_embedding_bag.py new file mode 100644 index 0000000..290d795 --- /dev/null +++ b/benchmarks/dlrm/ootb/tricks/qr_embedding_bag.py @@ -0,0 +1,185 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Quotient-Remainder Trick +# +# Description: Applies quotient remainder-trick to embeddings to reduce +# embedding sizes. 
+# +# References: +# [1] Hao-Jun Michael Shi, Dheevatsa Mudigere, Maxim Naumov, Jiyan Yang, +# "Compositional Embeddings Using Complementary Partitions for Memory-Efficient +# Recommendation Systems", CoRR, arXiv:1909.02107, 2019 + + +from __future__ import absolute_import, division, print_function, unicode_literals +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.parameter import Parameter +import numpy as np + + +class QREmbeddingBag(nn.Module): + r"""Computes sums or means over two 'bags' of embeddings, one using the quotient + of the indices and the other using the remainder of the indices, without + instantiating the intermediate embeddings, then performs an operation to combine these. + + For bags of constant length and no :attr:`per_sample_weights`, this class + + * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=0)``, + * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=0)``, + * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=0)``. + + However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these + operations. + + QREmbeddingBag also supports per-sample weights as an argument to the forward + pass. This scales the output of the Embedding before performing a weighted + reduction as specified by ``mode``. If :attr:`per_sample_weights`` is passed, the + only supported ``mode`` is ``"sum"``, which computes a weighted sum according to + :attr:`per_sample_weights`. + + Known Issues: + Autograd breaks with multiple GPUs. It breaks only with multiple embeddings. + + Args: + num_categories (int): total number of unique categories. The input indices must be in + 0, 1, ..., num_categories - 1. + embedding_dim (list): list of sizes for each embedding vector in each table. If ``"add"`` + or ``"mult"`` operation are used, these embedding dimensions must be + the same. If a single embedding_dim is used, then it will use this + embedding_dim for both embedding tables. + num_collisions (int): number of collisions to enforce. + operation (string, optional): ``"concat"``, ``"add"``, or ``"mult". Specifies the operation + to compose embeddings. ``"concat"`` concatenates the embeddings, + ``"add"`` sums the embeddings, and ``"mult"`` multiplies + (component-wise) the embeddings. + Default: ``"mult"`` + max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` + is renormalized to have norm :attr:`max_norm`. + norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. + scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of frequency of + the words in the mini-batch. Default ``False``. + Note: this option is not supported when ``mode="max"``. + mode (string, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag. + ``"sum"`` computes the weighted sum, taking :attr:`per_sample_weights` + into consideration. ``"mean"`` computes the average of the values + in the bag, ``"max"`` computes the max value over each bag. + Default: ``"mean"`` + sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See + Notes for more details regarding sparse gradients. Note: this option is not + supported when ``mode="max"``. 
+ + Attributes: + weight (Tensor): the learnable weights of each embedding table is the module of shape + `(num_embeddings, embedding_dim)` initialized using a uniform distribution + with sqrt(1 / num_categories). + + Inputs: :attr:`input` (LongTensor), :attr:`offsets` (LongTensor, optional), and + :attr:`per_index_weights` (Tensor, optional) + + - If :attr:`input` is 2D of shape `(B, N)`, + + it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and + this will return ``B`` values aggregated in a way depending on the :attr:`mode`. + :attr:`offsets` is ignored and required to be ``None`` in this case. + + - If :attr:`input` is 1D of shape `(N)`, + + it will be treated as a concatenation of multiple bags (sequences). + :attr:`offsets` is required to be a 1D tensor containing the + starting index positions of each bag in :attr:`input`. Therefore, + for :attr:`offsets` of shape `(B)`, :attr:`input` will be viewed as + having ``B`` bags. Empty bags (i.e., having 0-length) will have + returned vectors filled by zeros. + + per_sample_weights (Tensor, optional): a tensor of float / double weights, or None + to indicate all weights should be taken to be ``1``. If specified, :attr:`per_sample_weights` + must have exactly the same shape as input and is treated as having the same + :attr:`offsets`, if those are not ``None``. Only supported for ``mode='sum'``. + + + Output shape: `(B, embedding_dim)` + + """ + __constants__ = ['num_categories', 'embedding_dim', 'num_collisions', + 'operation', 'max_norm', 'norm_type', 'scale_grad_by_freq', + 'mode', 'sparse'] + + def __init__(self, num_categories, embedding_dim, num_collisions, + operation='mult', max_norm=None, norm_type=2., + scale_grad_by_freq=False, mode='mean', sparse=False, + _weight=None): + super(QREmbeddingBag, self).__init__() + + assert operation in ['concat', 'mult', 'add'], 'Not valid operation!' + + self.num_categories = num_categories + if isinstance(embedding_dim, int) or len(embedding_dim) == 1: + self.embedding_dim = [embedding_dim, embedding_dim] + else: + self.embedding_dim = embedding_dim + self.num_collisions = num_collisions + self.operation = operation + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + + if self.operation == 'add' or self.operation == 'mult': + assert self.embedding_dim[0] == self.embedding_dim[1], \ + 'Embedding dimensions do not match!' 
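+        # The quotient-remainder decomposition used in forward(): each category
+        # index i is looked up as (i // num_collisions) in the quotient table and
+        # (i % num_collisions) in the remainder table, and the two embeddings are
+        # combined with `operation`. E.g., num_categories=1000 with
+        # num_collisions=4 stores ceil(1000/4)=250 + 4 rows instead of 1000.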
+ + self.num_embeddings = [int(np.ceil(num_categories / num_collisions)), + num_collisions] + + if _weight is None: + self.weight_q = Parameter(torch.Tensor(self.num_embeddings[0], self.embedding_dim[0])) + self.weight_r = Parameter(torch.Tensor(self.num_embeddings[1], self.embedding_dim[1])) + self.reset_parameters() + else: + assert list(_weight[0].shape) == [self.num_embeddings[0], self.embedding_dim[0]], \ + 'Shape of weight for quotient table does not match num_embeddings and embedding_dim' + assert list(_weight[1].shape) == [self.num_embeddings[1], self.embedding_dim[1]], \ + 'Shape of weight for remainder table does not match num_embeddings and embedding_dim' + self.weight_q = Parameter(_weight[0]) + self.weight_r = Parameter(_weight[1]) + self.mode = mode + self.sparse = sparse + + def reset_parameters(self): + nn.init.uniform_(self.weight_q, np.sqrt(1 / self.num_categories)) + nn.init.uniform_(self.weight_r, np.sqrt(1 / self.num_categories)) + + def forward(self, input, offsets=None, per_sample_weights=None): + input_q = (input / self.num_collisions).long() + input_r = torch.remainder(input, self.num_collisions).long() + + embed_q = F.embedding_bag(input_q, self.weight_q, offsets, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.mode, + self.sparse, per_sample_weights) + embed_r = F.embedding_bag(input_r, self.weight_r, offsets, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.mode, + self.sparse, per_sample_weights) + + if self.operation == 'concat': + embed = torch.cat((embed_q, embed_r), dim=1) + elif self.operation == 'add': + embed = embed_q + embed_r + elif self.operation == 'mult': + embed = embed_q * embed_r + + return embed + + def extra_repr(self): + s = '{num_embeddings}, {embedding_dim}' + if self.max_norm is not None: + s += ', max_norm={max_norm}' + if self.norm_type != 2: + s += ', norm_type={norm_type}' + if self.scale_grad_by_freq is not False: + s += ', scale_grad_by_freq={scale_grad_by_freq}' + s += ', mode={mode}' + return s.format(**self.__dict__) diff --git a/benchmarks/dlrm/ubench/README_comms.md b/benchmarks/dlrm/ubench/README_comms.md new file mode 100644 index 0000000..5a76db0 --- /dev/null +++ b/benchmarks/dlrm/ubench/README_comms.md @@ -0,0 +1,5 @@ +# dlrm_ubench_comms_driver.py runs /param/train/comms/pt/comms.py. 
+ +# Note +If /param is empty, change to that directory and run: +git submodule update --init --recursive diff --git a/benchmarks/dlrm/ubench/dlrm_ubench_comms_driver.py b/benchmarks/dlrm/ubench/dlrm_ubench_comms_driver.py new file mode 100644 index 0000000..e157fc0 --- /dev/null +++ b/benchmarks/dlrm/ubench/dlrm_ubench_comms_driver.py @@ -0,0 +1,130 @@ +import argparse +import contextlib +import io +import itertools +import os +import pathlib +import subprocess +import sys +from itertools import product +from os import fspath + +# param ubenches +p = pathlib.Path(__file__).parent.resolve() / "../../../param/train/compute/pt" +sys.path.append(fspath(p)) +import dataset +import pytorch_emb as kemb +import pytorch_gemm as kgemm +import pytorch_linear as klinear + +# FB5 Logger +p = pathlib.Path(__file__).parent.resolve() / "../../../fb5logging" +sys.path.append(fspath(p)) +import loggerconstants +from fb5logger import FB5Logger + + +def main(): + parser = argparse.ArgumentParser(description="comms.py driver") + parser.add_argument( + "--size", + type=str, + default="small", + ) + parser.add_argument( + "--backend", + type=str, + default=("nccl"), + choices=["nccl", "gloo", "mpi", "ucc", "xla"], + ) + parser.add_argument( + "--collective", + type=str, + default=("all_to_all"), + choices=["all_to_all", "all_reduce"], + ) + parser.add_argument("--fb5logger", type=str, default=None) + args = parser.parse_args() + + if args.size not in ["small", "medium", "large"] and not ( + args.size.isdigit() and int(args.size) > 0 + ): + sys.exit("The --size argument provided is not a valid positive integer.") + + lookup = { + "small": 2200 if args.collective == "all_reduce" else 134000000, + "medium": 9944 if args.collective == "all_reduce" else 244000000, + "large": 22372 if args.collective == "all_reduce" else 544000000, + str(2200): "small" if args.collective == "all_reduce" else 2200, + str(9944): "medium" if args.collective == "all_reduce" else 9944, + str(22372): "large" if args.collective == "all_reduce" else 22372, + str(134000000): "small" if args.collective == "all_to_all" else 134000000, + str(244000000): "medium" if args.collective == "all_to_all" else 244000000, + str(544000000): "large" if args.collective == "all_to_all" else 544000000, + } + (x, y) = (args.size, lookup.get(args.size, args.size)) + (size, name) = (x, y) if args.size.isdigit() else (y, x) + + master_ip = "localhost" + num_compute_per_collective = 100 + mm_dim = 1000 + num_iter = 100 + + cmd = f""" + --f 2 + --n {num_iter} + --master-ip {master_ip} + --master-port 22565 + --collective {args.collective} + --b {size} + --e {size} + --num-compute {num_compute_per_collective} + --mm-dim {mm_dim} + --backend {args.backend} + """ + sys.argv = cmd.replace("\n", " ").replace(" ", "").split() + + print("") + comms_abs_dir_path = str( + pathlib.Path(__file__).absolute().parents[3].resolve() / "param/train/comms/pt" + ) + sys.path.append(comms_abs_dir_path) + from comms import main as comms_main + + fb5logger = FB5Logger(args.fb5logger) + fb5logger.header( + "DLRM", + "UBENCH", + "train", + "comms_" + args.collective.replace("_", "") + "_" + name, + score_metric=loggerconstants.GBPS, + ) + + comms_stdout = io.StringIO() + with contextlib.redirect_stdout(comms_stdout): + fb5logger.run_start() + comms_main() + + output = comms_stdout.getvalue().split("\n")[-3:] + output = [_.split("\t") for _ in output] + output[1].insert(4, "") + output[0][4] = "Latency(us):" + output[0].insert(5, "p50") + output[0].pop(7) + output[0].pop(0) + 
output[1].pop(0) + extra_metadata = {} + for a, b in zip(output[0], output[1]): + extra_metadata[a.lstrip()] = b.lstrip() + fb5logger.run_stop( + num_batches=num_iter, batch_size=None, extra_metadata=extra_metadata + ) + + print(comms_stdout.getvalue()) + print("-- Pretty Format --") + for a, b in zip(output[0], output[1]): + print("{:<15s}{:>4s}".format(a.lstrip(), b.lstrip())) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/dlrm/ubench/dlrm_ubench_train_driver.py b/benchmarks/dlrm/ubench/dlrm_ubench_train_driver.py new file mode 100644 index 0000000..15f407c --- /dev/null +++ b/benchmarks/dlrm/ubench/dlrm_ubench_train_driver.py @@ -0,0 +1,122 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import sys +import pathlib +from os import fspath +# param ubenches +p = pathlib.Path(__file__).parent.resolve() / "../../../param/train/compute/pt" +sys.path.append(fspath(p)) +import dataset +import pytorch_gemm as kgemm +import pytorch_emb as kemb +import pytorch_linear as klinear + +# FB5 Logger +p = pathlib.Path(__file__).parent.resolve() / "../../../fb5logging" +sys.path.append(fspath(p)) +from fb5logger import FB5Logger +import loggerconstants + +if __name__ == "__main__": + + import argparse + + parser = argparse.ArgumentParser( + description="Measuring the Compute Kernel Performance Using PyTorch" + ) + parser.add_argument('--warmups', type=int, default=10, help="warmup times") + parser.add_argument('--steps', type=int, default=100, help="repeat times") + parser.add_argument('--device', type=str, choices=['cpu', 'gpu', 'tpu'], required=True, help='valid devices') + parser.add_argument("--fb5logger", type=str, default=None) + + subparsers = parser.add_subparsers(title='kernels', dest='kernel') + subparsers.required = True + + parser_emb = subparsers.add_parser('emb', help='measure EmbeddingBag performance') + parser_emb.add_argument('-d', '--dataset', default='B') + parser_emb.add_argument("--randomseed", type=int, default=0) + parser_emb.add_argument("--usexlabag", action='store_true', help='use xlabad instead of embeddingbag') + parser_emb.add_argument("--alpha", default=0.0, help="Zipf param. 
Use uniform if == 0.0") + + parser_linear = subparsers.add_parser('linear', help='measure mlp performance') + parser_linear.add_argument('--optimizer-type', default='sgd', help='Optimizer: SGD', choices=['sgd']) + parser_linear.add_argument('-t', '--dtype', default='float', help="data type", choices=["float", "float16", "bfloat16"]) + parser_linear.add_argument('-d', '--dataset', default='small') + + # FB5 Logging + + args=parser.parse_args() + + print("Measuring the performance of ", args.kernel, " on device = ", args.device) + print("Steps = ", args.steps, " warmups = ", args.warmups) + + #fb5 logging header + if args.fb5logger is not None: + fb5logger = FB5Logger(args.fb5logger) + + if args.kernel == 'emb': + print("with emb dataset ", args.dataset) + global_bytes = 0 + global_elap = 0 + if args.fb5logger is not None: + fb5logger.header("DLRM", "UBENCH", "train", args.kernel + "_" + args.dataset, score_metric=loggerconstants.GBPS) + fb5logger.run_start() + if args.dataset == 'A': + run_dataset = dataset.emb_A + elif args.dataset == 'B': + run_dataset = dataset.emb_B + elif args.dataset == 'small': + small_dataset = [ (4800000, 56, 34, 2048), + (4800000, 56, 34, 4096),] + run_dataset = small_dataset + else: + import ast + run_dataset = ast.literal_eval(args.dataset) + for i in range(len(run_dataset)): + features, embdim, nnz, batch = run_dataset[i] + elap, total_bytes = kemb.run_single(args, features, embdim, nnz, batch) + elap /= args.steps + total_bytes /= 1.0e6 + global_bytes += total_bytes + global_elap += elap + if args.fb5logger is not None: + extra_metadata={"GB/s": global_bytes / global_elap / 1.0e3, "ELAP": global_elap, "BYTES": global_bytes} + fb5logger.run_stop(args.steps, batch, extra_metadata=extra_metadata) + else: + print("with linear dataset ", args.dataset, ", Data type: ", args.dtype) + global_flops = 0 + global_elap = 0 + if args.fb5logger is not None: + fb5logger.header("DLRM", "UBENCH", "train", args.kernel + "_" + args.dataset, score_metric=loggerconstants.TFPS) + fb5logger.run_start() + if args.dataset == 'A': + run_dataset = dataset.mlp_A + elif args.dataset == 'small': + small_dataset = [ (18, 1024, 1024, 1024, 128), + (18, 1024, 1024, 1024, 256),] + run_dataset = small_dataset + else: + import ast + run_dataset = ast.literal_eval(args.dataset) + for i in range(len(run_dataset)): + layer_num, input_size, hidden_size, output_size, batch_size = run_dataset[i] + elap, loss = klinear.run_single( + args, layer_num, input_size, hidden_size, output_size, batch_size + ) + elap /= args.steps + + flops = batch_size * ( + hidden_size * hidden_size * layer_num + + hidden_size * input_size + + hidden_size * output_size + ) + # Forward 2x and Backward 4x + flops *= 6 + global_flops += flops + global_elap += elap + if args.fb5logger is not None: + extra_metadata={"TF/s": global_flops / global_elap / 1.0e12, "ELAP": global_elap, "FLOPS": global_flops} + fb5logger.run_stop(args.steps, batch_size, extra_metadata=extra_metadata) diff --git a/benchmarks/rnnt/ootb/inference/QSL.py b/benchmarks/rnnt/ootb/inference/QSL.py new file mode 100644 index 0000000..3848ca3 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/QSL.py @@ -0,0 +1,71 @@ +import sys +import os +from os import fspath +import pathlib +sys.path.insert(0, fspath(pathlib.Path(__file__).parent.resolve() / "./pytorch")) + +from parts.manifest import Manifest +from parts.segment import AudioSegment + +import numpy as np + +import mlperf_loadgen as lg + + +class AudioQSL: + def __init__(self, dataset_dir, manifest_filepath, 
labels, + sample_rate=16000, perf_count=None): + m_paths = [manifest_filepath] + self.manifest = Manifest(dataset_dir, m_paths, labels, len(labels), + normalize=True, max_duration=15.0) + self.sample_rate = sample_rate + self.count = len(self.manifest) + perf_count = self.count if perf_count is None else perf_count + self.sample_id_to_sample = {} + self.qsl = lg.ConstructQSL(self.count, perf_count, + self.load_query_samples, + self.unload_query_samples) + print( + "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours. Number of samples: {2}".format( + self.manifest.duration / 3600, + self.manifest.filtered_duration / 3600, + self.count)) + + def load_query_samples(self, sample_list): + for sample_id in sample_list: + self.sample_id_to_sample[sample_id] = self._load_sample(sample_id) + + def unload_query_samples(self, sample_list): + for sample_id in sample_list: + del self.sample_id_to_sample[sample_id] + + def _load_sample(self, index): + sample = self.manifest[index] + segment = AudioSegment.from_file(sample['audio_filepath'][0], + target_sr=self.sample_rate) + waveform = segment.samples + assert isinstance(waveform, np.ndarray) and waveform.dtype == np.float32 + return waveform + + def __getitem__(self, index): + return self.sample_id_to_sample[index] + + def __del__(self): + lg.DestroyQSL(self.qsl) + print("Finished destroying QSL.") + + +# We have no problem fitting all data in memory, so we do that, in +# order to speed up execution of the benchmark. +class AudioQSLInMemory(AudioQSL): + def __init__(self, dataset_dir, manifest_filepath, labels, + sample_rate=16000, perf_count=None): + super().__init__(dataset_dir, manifest_filepath, labels, + sample_rate, perf_count) + super().load_query_samples(range(self.count)) + + def load_query_samples(self, sample_list): + pass + + def unload_query_samples(self, sample_list): + pass diff --git a/benchmarks/rnnt/ootb/inference/README.md b/benchmarks/rnnt/ootb/inference/README.md new file mode 100644 index 0000000..27fbabd --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/README.md @@ -0,0 +1,116 @@ +# 1. Problem +Speech recognition accepts raw audio samples and produces a corresponding +character transcription, without an external language model. + +# 2. Directions + +Open `run.sh`. Set the stage variable to "-1". Set "work_dir" to a +path backed by a disk with at least 30 GB of space. Most space is used +by loadgen logs, not the data or model. You need conda and a C/C++ +compiler on your PATH. I used conda 4.8.2. This script is responsible +for downloading dependencies, data, and the model. + +Run `./run.sh` from this directory. Note that stage 3 runs all of the +scenarios for the reference implementation, which will take a long +time, so you may want to exist before then. + +As you complete individual stages, you can set the variable "stage" to +a higher number for restarting from a later stage. + +# 3. Dataset/Environment +### Publication/Attribution +["OpenSLR LibriSpeech Corpus"](http://www.openslr.org/12/) provides over 1000 hours of speech data in the form of raw audio. +We use dev-clean, which is approximately 5 hours. We remove all samples with a length exceeding 15 seconds. + +### Data preprocessing +Log filterbanks of size 80 are extracted every 10 milliseconds, from +windows of size 20 milliseconds. Note that every three filterbanks are +concatenated together ("feature splicing"), so the model's effective +frame rate is actually 30 milliseconds. + +No dithering takes place. 
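+
+As a rough sketch of the frame-splicing arithmetic described above (this is
+not part of the reference code; the function below and its names are purely
+illustrative), stacking every three 10 millisecond, 80-dimensional filterbank
+frames yields one 240-dimensional feature every 30 milliseconds:
+
+```python
+import numpy as np
+
+def splice_frames(filterbanks, stack=3):
+    """Concatenate every `stack` consecutive frames.
+
+    filterbanks: array of shape (num_frames, 80), one frame per 10 ms hop.
+    Returns shape (num_frames // stack, 80 * stack): one spliced vector
+    per 30 ms when stack == 3.
+    """
+    num_frames, num_bins = filterbanks.shape
+    usable = (num_frames // stack) * stack  # drop the ragged tail
+    return filterbanks[:usable].reshape(-1, stack * num_bins)
+
+# e.g. ~6.6 s of audio -> 659 frames at a 10 ms hop -> 219 spliced frames
+spliced = splice_frames(np.random.randn(659, 80).astype(np.float32))
+print(spliced.shape)  # (219, 240)
+```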
+ +This is not typical preprocessing, since it takes place as part of the +model's measured runtime, not before the model runs. + +### Test data order + +Look at dev-clean-wav.json generated by run.sh. It looks like this: + +``` +[ + { + "files": [ + { + "channels": 1, + "sample_rate": 16000.0, + "bitrate": 16, + "duration": 6.59, + "num_samples": 105440, + "encoding": "Signed Integer PCM", + "silent": false, + "fname": "dev-clean-wav/2277/149896/2277-149896-0000.wav", + "speed": 1 + } + ], + "original_duration": 6.59, + "original_num_samples": 105440, + "transcript": "he was in a fevered state of mind owing to the blight his wife's action threatened to cast upon his entire future" + }, + { + "files": [ + { + "channels": 1, + "sample_rate": 16000.0, + "bitrate": 16, + "duration": 7.145, + "num_samples": 114320, + "encoding": "Signed Integer PCM", + "silent": false, + "fname": "dev-clean-wav/2277/149896/2277-149896-0001.wav", + "speed": 1 + } + ], + "original_duration": 7.145, + "original_num_samples": 114320, + "transcript": "he would have to pay her the money which she would now regularly demand or there would be trouble it did not matter what he did" + }, + ... +] +``` + +The data is loaded into memory. Then all samples with a duration above +15 seconds are filtered out. Then the first object in the array is +assigned query id 0, the second is assigned query id 1, etc. The +unfiltered file is uploaded to the directory containing README in case +you do not want to recreate this file. + +# 4. Model +This is a variant of the model described in sections 3.1 and 6.2 of: + +@article{, + title={STREAMING END-TO-END SPEECH RECOGNITION FOR MOBILE DEVICES}, + author={Yanzhang He, Tara N. Sainath, Rohit Prabhavalkar, Ian McGraw, Raziel Alvarez, Ding Zhao, + David Rybach, Anjuli Kannan, Yonghui Wu, Ruoming Pang, Qiao Liang, Deepti Bhatia, Yuan Shangguan, + Bo Li, Golan Pundak, Khe Chai Sim, Tom Bagby, Shuo-yiin Chang, Kanishka Rao, Alexander Gruenstein}, + journal={arXiv preprint arXiv:1811.06621}, + year={2018} +} + +The differences are as follows: + +1. The model has 45.3 million parameters, rather than 120 million parameters +1. The LSTMs are not followed by projection layers +1. No layer normalization is used +1. Hidden dimensions are smaller. +1. The prediction network is made of two LSTMs, rather than seven. +1. The labels are characters, rather than word pieces. +1. No quantization is done at this time for inference. +1. A greedy decoder is used, rather than a beamsearch decoder. This greatly + reduces inference complexity. + +# 5. Quality +### Quality metric +7.452253714852645% Word Error Rate (WER) across all words in the output text of +all samples less than 15 seconds in length in the dev-clean set, using a greedy +decoder and a fully FP32 model. 
\ No newline at end of file diff --git a/benchmarks/rnnt/ootb/inference/accuracy_eval.py b/benchmarks/rnnt/ootb/inference/accuracy_eval.py new file mode 100644 index 0000000..ea81792 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/accuracy_eval.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +import argparse +import array +import json +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "pytorch")) + +from QSL import AudioQSL +from helpers import process_evaluation_epoch, __gather_predictions +from parts.manifest import Manifest + +dtype_map = { + "int8": 'b', + "int16": 'h', + "int32": 'l', + "int64": 'q', +} + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", required=True) + parser.add_argument("--dataset_dir", required=True) + parser.add_argument("--manifest", required=True) + parser.add_argument("--output_dtype", default="int64", choices=dtype_map.keys(), help="Output data type") + args = parser.parse_args() + return args + +def main(): + args = get_args() + labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] + qsl = AudioQSL(args.dataset_dir, args.manifest, labels) + manifest = qsl.manifest + with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh: + results = json.load(fh) + hypotheses = [] + references = [] + for result in results: + hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) + references.append(manifest[result["qsl_idx"]]["transcript"]) + + references = __gather_predictions([references], labels=labels) + hypotheses = __gather_predictions([hypotheses], labels=labels) + + d = dict(predictions=hypotheses, + transcripts=references) + wer = process_evaluation_epoch(d) + print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100)) + +if __name__ == '__main__': + main() diff --git a/benchmarks/rnnt/ootb/inference/environment.yml b/benchmarks/rnnt/ootb/inference/environment.yml new file mode 100644 index 0000000..4958247 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/environment.yml @@ -0,0 +1,128 @@ +name: mlperf-rnnt +channels: + - pytorch + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - absl-py=0.9.0=py36_0 + - blas=1.0=mkl + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2020.4.5.1=hecc5488_0 + - certifi=2020.4.5.1=py36h9f0ad1d_0 + - cffi=1.14.0=py36h2e261b9_0 + - cmake=3.14.0=h52cb24c_0 + - cudatoolkit=10.1.243=h6bb024c_0 + - cudatoolkit-dev=10.1.243=h516909a_3 + - expat=2.2.6=he6710b0_0 + - freetype=2.9.1=h8a8886c_1 + - gdb=8.3.1=py36h497da48_1 + - intel-openmp=2020.0=166 + - jpeg=9b=h024ee3a_2 + - krb5=1.17.1=h173b8e3_0 + - lame=3.100=h14c3975_1001 + - ld_impl_linux-64=2.33.1=h53a641e_7 + - libcurl=7.69.1=h20c2e04_0 + - libedit=3.1.20181209=hc058e9b_0 + - libffi=3.2.1=hd88cf55_4 + - libgcc-ng=9.1.0=hdf63c60_0 + - libgfortran-ng=7.3.0=hdf63c60_0 + - libpng=1.6.37=hbc83047_0 + - libssh2=1.9.0=h1ba5d50_1 + - libstdcxx-ng=9.1.0=hdf63c60_0 + - libtiff=4.1.0=h2733197_0 + - mad=0.15.1b=he1b5a44_0 + - mkl=2020.0=166 + - mkl-include=2020.0=166 + - mkl-service=2.3.0=py36he904b0f_0 + - mkl_fft=1.0.15=py36ha843d7b_0 + - mkl_random=1.1.0=py36hd6b4f25_0 + - ncurses=6.1=hf484d3e_1002 + - ninja=1.9.0=py36hfd86e86_0 + - numpy=1.18.1=py36h4f9e942_0 + - numpy-base=1.18.1=py36hde5b4d6_1 + - olefile=0.46=py_0 + - openssl=1.1.1g=h516909a_0 + - pillow=7.0.0=py36hb39fc2d_0 + - pip=20.0.2=py36_1 + - pycparser=2.20=py_0 + 
- python=3.6.10=h0371630_0 + - python_abi=3.6=1_cp36m + - pytorch=1.5.0=py3.6_cuda10.1.243_cudnn7.6.3_0 + - pyyaml=5.3.1=py36h7b6447c_0 + - readline=7.0=hf8c457e_1001 + - rhash=1.3.8=h1ba5d50_0 + - setuptools=46.1.3=py36_0 + - six=1.14.0=py36_0 + - sqlite=3.31.1=h7b6447c_0 + - tk=8.6.8=hbc83047_0 + - torchvision=0.6.0=py36_cu101 + - wheel=0.34.2=py36_0 + - xz=5.2.4=h14c3975_4 + - yaml=0.1.7=had09818_2 + - zlib=1.2.11=h7b6447c_3 + - zstd=1.3.7=h0b5b093_0 + - pip: + - ascii-graph==1.5.1 + - attrs==19.3.0 + - audioread==2.1.8 + - autopep8==1.5.1 + - backcall==0.1.0 + - chardet==3.0.4 + - coverage==5.0.4 + - decorator==4.4.2 + - entrypoints==0.3 + - flake8==3.7.9 + - grpcio==1.28.1 + - idna==2.9 + - importlib-metadata==1.6.0 + - inflect==4.1.0 + - ipdb==0.13.2 + - ipython==7.13.0 + - ipython-genutils==0.2.0 + - jedi==0.16.0 + - joblib==0.14.1 + - librosa==0.7.2 + - llvmlite==0.31.0 + - markdown==3.2.1 + - mccabe==0.6.1 + - more-itertools==8.2.0 + - numba==0.48.0 + - onnx==1.6.0 + - onnxruntime==1.2.0 + - packaging==20.3 + - pandas==0.24.2 + - parso==0.6.2 + - pexpect==4.8.0 + - pickleshare==0.7.5 + - pluggy==0.13.1 + - prompt-toolkit==3.0.5 + - protobuf==3.11.3 + - ptyprocess==0.6.0 + - py==1.8.1 + - pycodestyle==2.5.0 + - pyflakes==2.1.1 + - pygments==2.6.1 + - pyparsing==2.4.7 + - pytest==5.4.2 + - python-dateutil==2.8.1 + - pytz==2019.3 + - requests==2.23.0 + - resampy==0.2.2 + - scikit-learn==0.22.2.post1 + - scipy==1.4.1 + - soundfile==0.10.3.post1 + - sox==1.3.7 + - tensorboard==2.0.0 + - toml==0.10.0 + - tqdm==4.31.1 + - traitlets==4.3.3 + - typing-extensions==3.7.4.2 + - unidecode==1.1.1 + - urllib3==1.25.8 + - wcwidth==0.1.9 + - werkzeug==1.0.1 + - wrapt==1.10.11 + - zipp==3.1.0 +prefix: /cb/home/daniel/ws/miniconda3/envs/mlperf-rnnt + diff --git a/benchmarks/rnnt/ootb/inference/loadgen/.clang-format b/benchmarks/rnnt/ootb/inference/loadgen/.clang-format new file mode 100644 index 0000000..f08c9c2 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: Google +Standard: Cpp11 diff --git a/benchmarks/rnnt/ootb/inference/loadgen/CMakeLists.txt b/benchmarks/rnnt/ootb/inference/loadgen/CMakeLists.txt new file mode 100644 index 0000000..7865287 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/CMakeLists.txt @@ -0,0 +1,68 @@ +cmake_minimum_required(VERSION 3.1) + +project(mlperf_loadgen) + +# The mlperf_loadgen version. +set(mlperf_loadgen_VERSION_MAJOR 1) +set(mlperf_loadgen_VERSION_MINOR 1) +message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}") + +# Set build options. NB: CXX_STANDARD is supported since CMake 3.1. +if (NOT MSVC) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -W -Wall") +endif() +message(STATUS "Using C++ compiler flags: ${CMAKE_CXX_FLAGS}") +set(CMAKE_CXX_STANDARD "14") +message(STATUS "Using C++ standard: ${CMAKE_CXX_STANDARD}") +message(STATUS "Using static linker flags: ${CMAKE_STATIC_LINKER_FLAGS}") +message(STATUS "Using shared linker flags: ${CMAKE_SHARED_LINKER_FLAGS}") + +# Output directory for libraries. +set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}) +message(STATUS "Using output path: ${LIBRARY_OUTPUT_PATH}") + +# Detect Python to use for generating source file with version info. +# NB: PythonInterp has been deprecated since CMake 3.12 +# but it works with earlier versions of CMake. +find_package(PythonInterp) +message(STATUS "Using Python interpreter: ${PYTHON_EXECUTABLE}") + +# Generate source file with version info. 
+execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/version_generator.py ${CMAKE_BINARY_DIR}/version_generated.cc ${CMAKE_CURRENT_SOURCE_DIR}) + +# Add source files. +set(SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.h + ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.cc + ${CMAKE_CURRENT_SOURCE_DIR}/issue_query_controller.cc + ${CMAKE_CURRENT_SOURCE_DIR}/loadgen.cc + ${CMAKE_CURRENT_SOURCE_DIR}/logging.cc + ${CMAKE_CURRENT_SOURCE_DIR}/logging.h + ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.cc + ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/utils.h + ${CMAKE_CURRENT_SOURCE_DIR}/version.cc + ${CMAKE_CURRENT_SOURCE_DIR}/version.h + ${CMAKE_BINARY_DIR}/version_generated.cc +) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +add_library(mlperf_loadgen STATIC ${SOURCE}) +target_link_libraries(mlperf_loadgen) + +if(WIN32) +set (LIBS "") +else() +set (LIBS pthread) +endif() + +add_executable(benchmark benchmark/repro.cpp) +target_link_libraries(benchmark PUBLIC mlperf_loadgen ${LIBS}) + +# Install library and headers. +install(TARGETS mlperf_loadgen + DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/include FILES_MATCHING PATTERN "*.h") diff --git a/benchmarks/rnnt/ootb/inference/loadgen/README.md b/benchmarks/rnnt/ootb/inference/loadgen/README.md new file mode 100644 index 0000000..e5329a1 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/README.md @@ -0,0 +1,105 @@ +# Overview {#mainpage} + +*Note:* A compiled html version of this document is hosted online +[here](https://mlperf.github.io/inference/loadgen/index.html). + +## Introduction + +* The LoadGen is a *reusable* module that *efficiently* and *fairly* measures + the performance of inference systems. +* It generates traffic for scenarios as formulated by a diverse set of experts + in the [MLPerf working group](https://mlperf.org/about). +* The scenarios emulate the workloads seen in mobile devices, + autonomous vehicles, robotics, and cloud-based setups. +* Although the LoadGen is not model or dataset aware, its strength is in its + reusability with logic that is. + +## Integration Example and Flow +The following is an diagram of how the LoadGen can be integrated into an +inference system, resembling how some of the MLPerf reference models are +implemented. +
+
+*(Diagram not reproduced in this copy; its annotated steps are:)*
+
+  1. Benchmark knows the model, dataset, and preprocessing.
+  2. Benchmark hands dataset sample IDs to LoadGen.
+  3. LoadGen starts generating queries of sample IDs.
+  4. Benchmark creates requests to backend.
+  5. Result is post processed and forwarded to LoadGen.
+  6. LoadGen outputs logs for analysis.
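+
+A minimal, illustrative sketch of this flow using the Python bindings
+(it mirrors the demos under *demos/*; the stub callbacks and settings below are
+placeholders, not a reference implementation):
+
+    import mlperf_loadgen
+
+    # Steps 1-2: the benchmark owns the model/dataset and tells LoadGen how to
+    # (un)load samples and how many samples exist.
+    def load_samples_to_ram(sample_indices):
+        pass
+
+    def unload_samples_from_ram(sample_indices):
+        pass
+
+    # Steps 3-5: LoadGen calls issue_query; the benchmark runs inference and
+    # reports completion for every sample it received.
+    def issue_query(query_samples):
+        responses = [mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)
+                     for s in query_samples]  # real SUTs point data/size at outputs
+        mlperf_loadgen.QuerySamplesComplete(responses)
+
+    def flush_queries():
+        pass
+
+    def process_latencies(latencies_ns):
+        pass
+
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Offline
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries, process_latencies)
+    qsl = mlperf_loadgen.ConstructQSL(1024, 128, load_samples_to_ram,
+                                      unload_samples_from_ram)
+    mlperf_loadgen.StartTest(sut, qsl, settings)  # Step 6: logs are written for analysis
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)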
+
+## Useful Links
+* [FAQ](@ref ReadmeFAQ)
+* [LoadGen Build Instructions](@ref ReadmeBuild)
+* [LoadGen API](@ref LoadgenAPI)
+* [Test Settings](@ref LoadgenAPITestSettings) -
+  A good description of available scenarios, modes, and knobs.
+* [MLPerf Inference Code](https://github.com/mlcommons/inference) -
+  Includes source for the LoadGen and reference models that use the LoadGen.
+* [MLPerf Inference Rules](https://github.com/mlcommons/inference_policies) -
+  Any mismatch with this is a bug in the LoadGen.
+* [MLPerf Website](www.mlperf.org)
+
+## Scope of the LoadGen's Responsibilities
+
+### In Scope
+* **Provide a reusable** C++ library with python bindings.
+* **Implement** the traffic patterns of the MLPerf Inference scenarios and
+  modes.
+* **Record** all traffic generated and received for later analysis and
+  verification.
+* **Summarize** the results and whether performance constraints were met.
+* **Target high-performance** systems with efficient multi-thread friendly
+  logging utilities.
+* **Generate trust** via a shared, well-tested, and community-hardened
+  code base.
+
+### Out of Scope
+The LoadGen is:
+* **NOT** aware of the ML model it is running against.
+* **NOT** aware of the data formats of the model's inputs and outputs.
+* **NOT** aware of how to score the accuracy of a model's outputs.
+* **NOT** aware of MLPerf rules regarding scenario-specific constraints.
+
+Limiting the scope of the LoadGen in this way keeps it reusable across
+different models and datasets without modification. Using composition and
+dependency injection, the user can define their own model, datasets, and
+metrics.
+
+Additionally, not hardcoding MLPerf-specific test constraints, like test
+duration and performance targets, allows users to use the LoadGen unmodified
+for custom testing and continuous integration purposes.
+
+## Submission Considerations
+
+### Upstream all local modifications
+* As a rule, no local modifications to the LoadGen's C++ library are allowed
+for submission.
+* Please upstream early and often to keep the playing field level.
+
+### Choose your TestSettings carefully!
+* Since the LoadGen is oblivious to the model, it can't enforce the MLPerf
+requirements for submission. *e.g.:* target percentiles and latencies.
+* For verification, the values in TestSettings are logged.
+* To help make sure your settings are spec compliant, use
+TestSettings::FromConfig in conjunction with the relevant config file provided
+with the reference models.
+
+## Responsibilities of a LoadGen User
+
+### Implement the Interfaces
+* Implement the SystemUnderTest and QuerySampleLibrary interfaces and pass
+  them to the StartTest function.
+* Call QuerySamplesComplete for every sample received by
+  SystemUnderTest::IssueQuery.
+
+### Assess Accuracy
+* Process the *mlperf_log_accuracy.json* output by the LoadGen to determine
+  the accuracy of your system.
+* For the official models, Python scripts will be provided by the MLPerf model
+  owners for you to do this automatically.
+
+For templates of how to do the above in detail, refer to code for the demos,
+tests, and reference models.
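+
+As a worked example of the *Choose your TestSettings carefully!* advice above,
+a sketch of loading settings from a config file via the Python bindings. The
+file names and the "rnnt" model tag are placeholders, and the argument order
+assumed here is FromConfig(config_path, model, scenario):
+
+    import mlperf_loadgen
+
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Server
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    # Hypothetical paths; use the config shipped with the reference model.
+    settings.FromConfig("mlperf.conf", "rnnt", "Server")
+    settings.FromConfig("user.conf", "rnnt", "Server")
+
+    log_settings = mlperf_loadgen.LogSettings()
+    log_settings.log_output.outdir = "results"
+    log_settings.log_output.copy_summary_to_stdout = True
+
+    # sut and qsl come from ConstructSUT/ConstructQSL as in the earlier sketch.
+    mlperf_loadgen.StartTestWithLogSettings(sut, qsl, settings, log_settings)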
diff --git a/benchmarks/rnnt/ootb/inference/loadgen/README_BUILD.md b/benchmarks/rnnt/ootb/inference/loadgen/README_BUILD.md new file mode 100644 index 0000000..095a8d8 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/README_BUILD.md @@ -0,0 +1,32 @@ +# Building the LoadGen {#ReadmeBuild} + +## Prerequisites + + sudo apt-get install libglib2.0-dev python-pip python3-pip + pip2 install absl-py numpy + pip3 install absl-py numpy + +## Quick Start + + pip install absl-py numpy + git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference + cd mlperf_inference/loadgen + CFLAGS="-std=c++14 -O3" python setup.py bdist_wheel + pip install --force-reinstall dist/mlperf_loadgen-0.5a0-cp36-cp36m-linux_x86_64.whl + python demos/py_demo_single_stream.py + +This will fetch the loadgen source, build and install the loadgen as a python module, and run a simple end-to-end demo. The exact *.whl filename may differ on your system, but there should only be one resulting whl file for you to use. + +A summary of the test results can be found in the *"mlperf_log_summary.txt"* logfile. + +For a timeline visualization of what happened during the test, open the *"mlperf_log_trace.json"* file in Chrome: +* Type “chrome://tracing” in the address bar, then drag-n-drop the json. +* This may be useful for SUT performance tuning and understanding + debugging the loadgen. + +To build the loadgen as a C++ library, rather than a python module: + + git clone https://github.com/mlcommons/inference.git mlperf_inference + cd mlperf_inference + mkdir loadgen/build/ && cd loadgen/build/ + cmake .. && cmake --build . + cp libmlperf_loadgen.a .. diff --git a/benchmarks/rnnt/ootb/inference/loadgen/README_FAQ.md b/benchmarks/rnnt/ootb/inference/loadgen/README_FAQ.md new file mode 100644 index 0000000..c1093a6 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/README_FAQ.md @@ -0,0 +1,88 @@ +# LoadGen FAQ {#ReadmeFAQ} + +## Q: The LoadGen does not match the MLPerf specification. Who is right? +**A:** +The MLPerf spec is *always* right. +Please file a LoadGen bug so it may be resolved. + +## Q: How can I file a bug? +**A:** +On GitHub: https://github.com/mlcommons/inference/issues/new + +## Q: Can I make local modifications to the LoadGen for submission? +**A:** +No. To keep the playing field level, please upstream any local +modificiations you need to make. Ideally upstream such changes behind a runtime +flag or via an abstract interface the client can implement. This will help +with testability. + +## Q: Where can I find the results of a test? +**A:** +By default, the loadgen will output an *mlperf_log_summary.txt* file +that summarizes the target metrics and constraints of the test, along with +other stats about the run. + +*Note:* LogSettings also has a flag to forward the results to stdout and +there's an outstanding TODO to make this more programmable. + +## Q: The reference implementation for \<*some_model*\> prints out results of its own. Are those for submission? +**A:** +They are not. The LoadGen results are the ground truth for submission +results since they will work even for systems that forgo the python bindings. +If you notice a bug in the LoadGen's results, please file a bug or submit a +patch. + +## Q: I'm getting linker errors for LoadgenVersion definitions. Where is *version_generated.cc*? +**A:** +If you have a custom build setup, make sure you run the *version_generator.py* +script, which will create the cc file you are looking for. 
The official build +files that come with the LoadGen do this for you out of the box. + +## Q: What is this *version_generator.py* script? +**A:** +The LoadGen records git stats (if available) and the SHA1 of all its +source files (always) at build time for verification purposes. This is easy +to circumvent, but try your best to run *version_generator.py* correctly; +ideally integrated with your build system if you have a custom build. +The intention is more to help with debugging efforts and detect accidental +version missmatches than to detect bad actors. + +## Q: How do I view the *mlperf_log_trace.json* file? +**A:** +This file uses the [Trace Event Format] +(https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit) +to record a timeline of all the threads involved. +You can view the file by typing [chrome://tracing](chrome://tracing) into +Chrome's address bar and dragging the json file there. +This file zips well and you can drag the zip file directly into +[chrome://tracing](chrome://tracing) too. +Please include zipped traces (and the other logs) when filing bug reports. + +## Q: What is the difference between the MultiStream and MultiStreamFree scenarios? +**A:** +MultiStream corresponds to the official MLPerf scenario for submissions; +it has a fixed query rate and allows only one outstanding query at a time. +MultiStreamFree is implemented for evaluation purposes only; it sends queries +as fast as possible and allows up to N outstanding queries at a time. You may +want to use MultiStreamFree for development purposes since small improvements +in performance will always be reflected in the results, whereas MultiStream's +results will be quantized. + +## Q: Why is the code littered with so many lambdas? My eyes hurt. +**A:** +Lambdas are a convenient and efficient way to ship arbitrary data + deferred +logic over to the logging thread without much boilerplate. +Much of the loadgen is built on top of the logging utilities. +Thus the lambdas. (Sorry about the eyes.) + +## Q: What C++ version does the LoadGen target? +**A:** +It currently targets and requires C++14. It should compile with recent +versions of clang, gcc, and msvc. + +## Q: What dependencies does the LoadGen code have? +**A:** +The C++ code has no external dependencies. The loadgen itself, logging +utilities, and unit test utilities are built solely on the C++ Standard Library. +The python bindings, however, do require +[pybind11](https://github.com/pybind/pybind11). diff --git a/benchmarks/rnnt/ootb/inference/loadgen/benchmark/.gitignore b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/.gitignore new file mode 100644 index 0000000..e792c8e --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/.gitignore @@ -0,0 +1,2 @@ +loadgen_build +build \ No newline at end of file diff --git a/benchmarks/rnnt/ootb/inference/loadgen/benchmark/README.md b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/README.md new file mode 100644 index 0000000..24e8729 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/README.md @@ -0,0 +1,10 @@ +Note: please install jemalloc first. 
See: http://jemalloc.net/ +Command: bash run.sh <0=Basic,1=Queue> + +Experiments: +- On Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz +- Basic SUT : 500-600k i/s +- Basic SUT + jemalloc: 800-900k i/s (`bash run.sh 800000 0`) +- Queued SUT (2 complete threads) + jemalloc: 1.2-1.3M i/s (`bash run.sh 1200000 1 2 2048`) +- Queued SUT (2 complete threads) + jemalloc + server_coalesce_queries: 1.4-1.5M is/ (`bash run.sh 1400000 1 2 512 1`) +- Basic SUT + jemalloc + server_coalesce_queries + 4 IssueQueryThreads: 2.4-2.5M is/ (`bash run.sh 2400000 0 2 512 1 4`) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/benchmark/repro.cpp b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/repro.cpp new file mode 100644 index 0000000..8b4bc8a --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/repro.cpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "loadgen.h" +#include "query_sample_library.h" +#include "system_under_test.h" +#include "test_settings.h" + +class QSL : public mlperf::QuerySampleLibrary { + public: + ~QSL() override{}; + const std::string& Name() const override { return mName; } + size_t TotalSampleCount() override { return 1000000; } + size_t PerformanceSampleCount() override { return TotalSampleCount(); } + void LoadSamplesToRam( + const std::vector& samples) override {} + void UnloadSamplesFromRam( + const std::vector& samples) override {} + + private: + std::string mName{"Dummy QSL"}; +}; + +class BasicSUT : public mlperf::SystemUnderTest { + public: + BasicSUT() { + // Start with some large value so that we don't reallocate memory. + initResponse(10000); + } + ~BasicSUT() override {} + const std::string& Name() const override { return mName; } + void IssueQuery(const std::vector& samples) override { + int n = samples.size(); + if (n > mResponses.size()) { + std::cerr << "Warning: reallocating response buffer in BasicSUT. Maybe " + "you should initResponse with larger value!?" + << std::endl; + initResponse(samples.size()); + } + for (int i = 0; i < n; i++) { + mResponses[i].id = samples[i].id; + } + mlperf::QuerySamplesComplete(mResponses.data(), n); + } + void FlushQueries() override {} + void ReportLatencyResults( + const std::vector& latencies_ns) override{}; + + private: + void initResponse(int size) { + mResponses.resize(size, + {0, reinterpret_cast(&mBuf), sizeof(int)}); + } + int mBuf{0}; + std::string mName{"BasicSUT"}; + std::vector mResponses; +}; + +class QueueSUT : public mlperf::SystemUnderTest { + public: + QueueSUT(int numCompleteThreads, int maxSize) { + // Each thread handle at most maxSize at a time. 
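+    // Design note: IssueQuery only appends sample ids to mIdQueue under mMtx
+    // and notifies one worker; each of the numCompleteThreads workers then
+    // drains up to maxSize ids into its own preallocated response buffer and
+    // calls mlperf::QuerySamplesComplete outside the lock.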
+ std::cout << "QueueSUT: maxSize = " << maxSize << std::endl; + initResponse(numCompleteThreads, maxSize); + // Launch complete threads + for (int i = 0; i < numCompleteThreads; i++) { + mThreads.emplace_back(&QueueSUT::CompleteThread, this, i); + } + } + ~QueueSUT() override { + { + std::unique_lock lck(mMtx); + mDone = true; + mCondVar.notify_all(); + } + for (auto& thread : mThreads) { + thread.join(); + } + } + const std::string& Name() const override { return mName; } + void IssueQuery(const std::vector& samples) override { + std::unique_lock lck(mMtx); + for (const auto& sample : samples) { + mIdQueue.push_back(sample.id); + } + // Let some worker thread to consume tasks + mCondVar.notify_one(); + } + void FlushQueries() override {} + void ReportLatencyResults( + const std::vector& latencies_ns) override{}; + + private: + void CompleteThread(int threadIdx) { + auto& responses = mResponses[threadIdx]; + size_t maxSize{responses.size()}; + size_t actualSize{0}; + while (true) { + { + std::unique_lock lck(mMtx); + mCondVar.wait(lck, [&]() { return !mIdQueue.empty() || mDone; }); + + if (mDone) { + break; + } + + actualSize = std::min(maxSize, mIdQueue.size()); + for (int i = 0; i < actualSize; i++) { + responses[i].id = mIdQueue.front(); + mIdQueue.pop_front(); + } + mCondVar.notify_one(); + } + mlperf::QuerySamplesComplete(responses.data(), actualSize); + } + } + void initResponse(int numCompleteThreads, int size) { + mResponses.resize(numCompleteThreads); + for (auto& responses : mResponses) { + responses.resize(size, + {0, reinterpret_cast(&mBuf), sizeof(int)}); + } + } + int mBuf{0}; + std::string mName{"QueueSUT"}; + std::vector> mResponses; + std::vector mThreads; + std::deque mIdQueue; + std::mutex mMtx; + std::condition_variable mCondVar; + bool mDone{false}; +}; + +class MultiBasicSUT : public mlperf::SystemUnderTest { + public: + MultiBasicSUT(int numThreads) + : mNumThreads(numThreads), mResponses(numThreads) { + // Start with some large value so that we don't reallocate memory. + initResponse(10000); + for (int i = 0; i < mNumThreads; ++i) { + mThreads.emplace_back(&MultiBasicSUT::startIssueThread, this, i); + } + } + ~MultiBasicSUT() override { + for (auto& thread : mThreads) { + thread.join(); + } + } + const std::string& Name() const override { return mName; } + void IssueQuery(const std::vector& samples) override { + int thread_idx = mThreadMap[std::this_thread::get_id()]; + int n = samples.size(); + auto& reponses = mResponses[thread_idx]; + if (n > reponses.size()) { + std::cout + << "Warning: reallocating response buffer in MultiBasicSUT. Maybe " + "you should initResponse with larger value!?" 
+ << std::endl; + initResponse(samples.size()); + } + for (int i = 0; i < n; i++) { + reponses[i].id = samples[i].id; + } + mlperf::QuerySamplesComplete(reponses.data(), n); + } + void FlushQueries() override {} + void ReportLatencyResults( + const std::vector& latencies_ns) override{}; + + private: + void initResponse(int size) { + for (auto& responses : mResponses) { + responses.resize(size, + {0, reinterpret_cast(&mBuf), sizeof(int)}); + } + } + void startIssueThread(int thread_idx) { + { + std::lock_guard lock(mMtx); + mThreadMap[std::this_thread::get_id()] = thread_idx; + } + mlperf::RegisterIssueQueryThread(); + } + int mBuf{0}; + int mNumThreads{0}; + std::string mName{"MultiBasicSUT"}; + std::vector> mResponses; + std::mutex mMtx; + std::vector mThreads; + std::map mThreadMap; +}; + +int main(int argc, char** argv) { + assert(argc >= 2 && "Need to pass in at least one argument: target_qps"); + int target_qps = std::stoi(argv[1]); + std::cout << "target_qps = " << target_qps << std::endl; + + bool useQueue{false}; + int numCompleteThreads{4}; + int maxSize{1}; + bool server_coalesce_queries{false}; + int num_issue_threads{0}; + if (argc >= 3) { + useQueue = std::stoi(argv[2]) != 0; + } + if (argc >= 4) { + numCompleteThreads = std::stoi(argv[3]); + } + if (argc >= 5) { + maxSize = std::stoi(argv[4]); + } + if (argc >= 6) { + server_coalesce_queries = std::stoi(argv[5]) != 0; + } + if (argc >= 7) { + num_issue_threads = std::stoi(argv[6]); + } + + QSL qsl; + std::unique_ptr sut; + + // Configure the test settings + mlperf::TestSettings testSettings; + testSettings.scenario = mlperf::TestScenario::Server; + testSettings.mode = mlperf::TestMode::PerformanceOnly; + testSettings.server_target_qps = target_qps; + testSettings.server_target_latency_ns = 10000000; // 10ms + testSettings.server_target_latency_percentile = 0.99; + testSettings.min_duration_ms = 60000; + testSettings.min_query_count = 270000; + testSettings.server_coalesce_queries = server_coalesce_queries; + std::cout << "testSettings.server_coalesce_queries = " + << (server_coalesce_queries ? "True" : "False") << std::endl; + testSettings.server_num_issue_query_threads = num_issue_threads; + std::cout << "num_issue_threads = " << num_issue_threads << std::endl; + + // Configure the logging settings + mlperf::LogSettings logSettings; + logSettings.log_output.outdir = "build"; + logSettings.log_output.prefix = "mlperf_log_"; + logSettings.log_output.suffix = ""; + logSettings.log_output.prefix_with_datetime = false; + logSettings.log_output.copy_detail_to_stdout = false; + logSettings.log_output.copy_summary_to_stdout = true; + logSettings.log_mode = mlperf::LoggingMode::AsyncPoll; + logSettings.log_mode_async_poll_interval_ms = 1000; + logSettings.enable_trace = false; + + // Choose SUT + if (num_issue_threads == 0) { + if (useQueue) { + std::cout << "Using QueueSUT with " << numCompleteThreads + << " complete threads" << std::endl; + sut.reset(new QueueSUT(numCompleteThreads, maxSize)); + } else { + std::cout << "Using BasicSUT" << std::endl; + sut.reset(new BasicSUT()); + } + } else { + if (useQueue) { + std::cout << "Using MultiQueueSUT with " << numCompleteThreads + << " complete threads" << std::endl; + std::cerr << "!!!! MultiQueueSUT is NOT implemented yet !!!!" 
+ << std::endl; + return 1; + // sut.reset(new MultiQueueSUT(num_issue_threads, numCompleteThreads, + // maxSize)); + } else { + std::cout << "Using MultiBasicSUT" << std::endl; + sut.reset(new MultiBasicSUT(num_issue_threads)); + } + } + + // Start test + std::cout << "Start test..." << std::endl; + mlperf::StartTest(sut.get(), &qsl, testSettings, logSettings); + std::cout << "Test done. Clean up SUT..." << std::endl; + sut.reset(); + std::cout << "Done!" << std::endl; + return 0; +} diff --git a/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run.sh b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run.sh new file mode 100644 index 0000000..62559c1 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run.sh @@ -0,0 +1,21 @@ +#!/usr/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +echo "Building loadgen..." +if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi; +cd loadgen_build && cmake ../.. && make -j && cd .. +echo "Building test program..." +if [ ! -e build ]; then mkdir build; fi; +g++ --std=c++11 -O3 -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \ +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 build/repro.exe $1 $2 $3 $4 $5 $6 diff --git a/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run_debug.sh b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run_debug.sh new file mode 100644 index 0000000..ba63727 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/benchmark/run_debug.sh @@ -0,0 +1,21 @@ +#!/usr/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +echo "Building loadgen in Debug mode..." +if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi; +cd loadgen_build && cmake -DCMAKE_BUILD_TYPE=Debug ../.. && make -j && cd .. +echo "Building test program in Debug mode..." +if [ ! -e build ]; then mkdir build; fi; +g++ --std=c++11 -O0 -g -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \ +gdb --args build/repro.exe $1 $2 $3 $4 $5 $6 diff --git a/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.cc b/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.cc new file mode 100644 index 0000000..9de41da --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.cc @@ -0,0 +1,168 @@ +/* Copyright 2019 The MLPerf Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "c_api.h" + +#include + +#include "../loadgen.h" +#include "../query_sample.h" +#include "../query_sample_library.h" +#include "../system_under_test.h" +#include "../test_settings.h" + +namespace mlperf { +namespace c { +namespace { + +// Forwards SystemUnderTest calls to relevant callbacks. +class SystemUnderTestTrampoline : public SystemUnderTest { + public: + SystemUnderTestTrampoline( + ClientData client_data, std::string name, IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) + : client_data_(client_data), + name_(std::move(name)), + issue_cb_(issue_cb), + flush_queries_cb_(flush_queries_cb), + report_latency_results_cb_(report_latency_results_cb) {} + ~SystemUnderTestTrampoline() override = default; + + const std::string& Name() const override { return name_; } + + void IssueQuery(const std::vector& samples) override { + (*issue_cb_)(client_data_, samples.data(), samples.size()); + } + + void FlushQueries() override { (*flush_queries_cb_)(); } + + void ReportLatencyResults( + const std::vector& latencies_ns) override { + (*report_latency_results_cb_)(client_data_, latencies_ns.data(), + latencies_ns.size()); + } + + private: + ClientData client_data_; + std::string name_; + IssueQueryCallback issue_cb_; + FlushQueriesCallback flush_queries_cb_; + ReportLatencyResultsCallback report_latency_results_cb_; +}; + +} // namespace + +void* ConstructSUT(ClientData client_data, const char* name, size_t name_length, + IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) { + SystemUnderTestTrampoline* sut = new SystemUnderTestTrampoline( + client_data, std::string(name, name_length), issue_cb, flush_queries_cb, + report_latency_results_cb); + return reinterpret_cast(sut); +} + +void DestroySUT(void* sut) { + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + delete sut_cast; +} + +namespace { + +// Forwards QuerySampleLibrary calls to relevant callbacks. 
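+// As with the SUT trampoline above, the only client state carried here is the
+// opaque ClientData handle, which is passed back on every callback so the C
+// caller can recover its own context without globals.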
+class QuerySampleLibraryTrampoline : public QuerySampleLibrary { + public: + QuerySampleLibraryTrampoline( + ClientData client_data, std::string name, size_t total_sample_count, + size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) + : client_data_(client_data), + name_(std::move(name)), + total_sample_count_(total_sample_count), + performance_sample_count_(performance_sample_count), + load_samples_to_ram_cb_(load_samples_to_ram_cb), + unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {} + ~QuerySampleLibraryTrampoline() override = default; + + const std::string& Name() const override { return name_; } + size_t TotalSampleCount() override { return total_sample_count_; } + size_t PerformanceSampleCount() override { return performance_sample_count_; } + + void LoadSamplesToRam(const std::vector& samples) override { + (*load_samples_to_ram_cb_)(client_data_, samples.data(), samples.size()); + } + void UnloadSamplesFromRam( + const std::vector& samples) override { + (*unload_samples_from_ram_cb_)(client_data_, samples.data(), + samples.size()); + } + + private: + ClientData client_data_; + std::string name_; + size_t total_sample_count_; + size_t performance_sample_count_; + LoadSamplesToRamCallback load_samples_to_ram_cb_; + UnloadSamplesFromRamCallback unload_samples_from_ram_cb_; +}; + +} // namespace + +void* ConstructQSL(ClientData client_data, const char* name, size_t name_length, + size_t total_sample_count, size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) { + QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline( + client_data, std::string(name, name_length), total_sample_count, + performance_sample_count, load_samples_to_ram_cb, + unload_samples_from_ram_cb); + return reinterpret_cast(qsl); +} + +void DestroyQSL(void* qsl) { + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + delete qsl_cast; +} + +// mlperf::c::StartTest just forwards to mlperf::StartTest after doing the +// proper cast. +void StartTest(void* sut, void* qsl, const TestSettings& settings) { + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + LogSettings default_log_settings; + mlperf::StartTest(sut_cast, qsl_cast, settings, default_log_settings); +} + +void QuerySamplesComplete(QuerySampleResponse* responses, + size_t response_count) { + mlperf::QuerySamplesComplete(responses, response_count); +} + +void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses, + size_t response_count, ResponseCallback response_cb, + ClientData client_data) { + mlperf::QuerySamplesComplete(responses, response_count, + [client_data, response_cb] (QuerySampleResponse* response) { + response_cb(client_data, response); + }); +} + +void RegisterIssueQueryThread() { mlperf::RegisterIssueQueryThread(); } + +} // namespace c +} // namespace mlperf diff --git a/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.h b/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.h new file mode 100644 index 0000000..cf1a859 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/bindings/c_api.h @@ -0,0 +1,90 @@ +/* Copyright 2019 The MLPerf Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/// \file +/// \brief A C API wrapping the C++ loadgen. Not tested. Needs work. +/// \details The C API allows a C or Python client to easily create +/// a SystemUnderTest without having to expose the SystemUnderTest class +/// directly. +/// ConstructSUT works with a bunch of function poitners instead that are +/// called from an underlying trampoline class. + +#ifndef SYSTEM_UNDER_TEST_C_API_H_ +#define SYSTEM_UNDER_TEST_C_API_H_ + +#include +#include + +#include "../query_sample.h" +#include "../test_settings.h" + +namespace mlperf { + +namespace c { + +/// \brief Optional opaque client data that creators of SUTs and QSLs can have +/// the loadgen pass back to their callback invocations. +/// Helps avoids global variables. +typedef uintptr_t ClientData; + +typedef void (*IssueQueryCallback)(ClientData, const QuerySample*, size_t); +typedef void (*FlushQueriesCallback)(); +typedef void (*ReportLatencyResultsCallback)(ClientData, const int64_t*, + size_t); +typedef void (*ResponseCallback)(ClientData, QuerySampleResponse*); + +/// \brief SUT calls this function to report query result back to loadgen +void QuerySamplesComplete(QuerySampleResponse* responses, + size_t response_count); + +void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses, + size_t response_count, + ResponseCallback response_cb, + ClientData client_data); + +/// \brief Create an opaque SUT pointer based on C callbacks. +void* ConstructSUT(ClientData client_data, const char* name, size_t name_length, + IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb); +/// \brief Destroys the SUT created by ConstructSUT. +void DestroySUT(void* sut); + +typedef void (*LoadSamplesToRamCallback)(ClientData, const QuerySampleIndex*, + size_t); +typedef void (*UnloadSamplesFromRamCallback)(ClientData, + const QuerySampleIndex*, size_t); + +/// \brief Create an opaque QSL pointer based on C callbacks. +void* ConstructQSL(ClientData client_data, const char* name, size_t name_length, + size_t total_sample_count, size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb); +/// \brief Destroys the QSL created by ConsructQSL. +void DestroyQSL(void* qsl); + +/// \brief Run tests on a SUT created by ConstructSUT(). +/// \details This is the C entry point. See mlperf::StartTest for the C++ entry +/// point. +void StartTest(void* sut, void* qsl, const TestSettings& settings); + +/// +/// \brief Register a thread for query issuing in Server scenario. +/// \details This is the C entry point. See mlperf::RegisterIssueQueryThread for the C++ entry +/// point. 
+/// +void RegisterIssueQueryThread(); + +} // namespace c +} // namespace mlperf + +#endif // SYSTEM_UNDER_TEST_C_API_H_ diff --git a/benchmarks/rnnt/ootb/inference/loadgen/bindings/python_api.cc b/benchmarks/rnnt/ootb/inference/loadgen/bindings/python_api.cc new file mode 100644 index 0000000..140604e --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/bindings/python_api.cc @@ -0,0 +1,397 @@ +/* Copyright 2019 The MLPerf Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/// \file +/// \brief Python bindings for the loadgen using pybind11. + +#ifndef PYTHON_BINDINGS_H +#define PYTHON_BINDINGS_H + +#include + +#include "../loadgen.h" +#include "../query_sample.h" +#include "../query_sample_library.h" +#include "../system_under_test.h" +#include "../test_settings.h" +#include "pybind11/functional.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "pybind11/stl_bind.h" + +namespace mlperf { + +namespace { + +using IssueQueryCallback = std::function)>; +using FastIssueQueriesCallback = + std::function, std::vector)>; +using FlushQueriesCallback = std::function; +using ReportLatencyResultsCallback = std::function)>; + +// Forwards SystemUnderTest calls to relevant callbacks. 
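+// Note: py::StartTest below releases the GIL and the loadgen issues queries
+// from its own threads, so the query-issuing and latency-reporting callbacks
+// re-acquire it (pybind11::gil_scoped_acquire) before calling into Python.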
+class SystemUnderTestTrampoline : public SystemUnderTest { + public: + SystemUnderTestTrampoline( + std::string name, IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) + : name_(std::move(name)), + issue_cb_(issue_cb), + flush_queries_cb_(flush_queries_cb), + report_latency_results_cb_(report_latency_results_cb) {} + ~SystemUnderTestTrampoline() override = default; + + const std::string& Name() const override { return name_; } + + void IssueQuery(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + issue_cb_(samples); + } + + void FlushQueries() override { flush_queries_cb_(); } + + void ReportLatencyResults( + const std::vector& latencies_ns) override { + pybind11::gil_scoped_acquire gil_acquirer; + report_latency_results_cb_(latencies_ns); + } + + protected: + std::string name_; + IssueQueryCallback issue_cb_; + FlushQueriesCallback flush_queries_cb_; + ReportLatencyResultsCallback report_latency_results_cb_; +}; + +class FastSystemUnderTestTrampoline : public SystemUnderTestTrampoline { + public: + FastSystemUnderTestTrampoline( + std::string name, FastIssueQueriesCallback fast_issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) + : SystemUnderTestTrampoline(name, nullptr, flush_queries_cb, + report_latency_results_cb), + fast_issue_cb_(fast_issue_cb) {} + ~FastSystemUnderTestTrampoline() override = default; + + void IssueQuery(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + std::vector responseIds; + std::vector querySampleIndices; + for (auto& s : samples) { + responseIds.push_back(s.id); + querySampleIndices.push_back(s.index); + } + fast_issue_cb_(responseIds, querySampleIndices); + } + + private: + FastIssueQueriesCallback fast_issue_cb_; +}; + +using LoadSamplesToRamCallback = + std::function)>; +using UnloadSamplesFromRamCallback = + std::function)>; + +// Forwards QuerySampleLibrary calls to relevant callbacks. +class QuerySampleLibraryTrampoline : public QuerySampleLibrary { + public: + QuerySampleLibraryTrampoline( + std::string name, size_t total_sample_count, + size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) + : name_(std::move(name)), + total_sample_count_(total_sample_count), + performance_sample_count_(performance_sample_count), + load_samples_to_ram_cb_(load_samples_to_ram_cb), + unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {} + ~QuerySampleLibraryTrampoline() override = default; + + const std::string& Name() const override { return name_; } + size_t TotalSampleCount() { return total_sample_count_; } + size_t PerformanceSampleCount() { return performance_sample_count_; } + + void LoadSamplesToRam(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + load_samples_to_ram_cb_(samples); + } + void UnloadSamplesFromRam( + const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + unload_samples_from_ram_cb_(samples); + } + + private: + std::string name_; + size_t total_sample_count_; + size_t performance_sample_count_; + LoadSamplesToRamCallback load_samples_to_ram_cb_; + UnloadSamplesFromRamCallback unload_samples_from_ram_cb_; +}; + +} // namespace + +/// \brief Python bindings. 
+namespace py { + +uintptr_t ConstructSUT(IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) { + SystemUnderTestTrampoline* sut = new SystemUnderTestTrampoline( + "PySUT", issue_cb, flush_queries_cb, report_latency_results_cb); + return reinterpret_cast(sut); +} + +void DestroySUT(uintptr_t sut) { + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + delete sut_cast; +} + +uintptr_t ConstructFastSUT( + FastIssueQueriesCallback fast_issue_cb, + FlushQueriesCallback flush_queries_cb, + ReportLatencyResultsCallback report_latency_results_cb) { + FastSystemUnderTestTrampoline* sut = new FastSystemUnderTestTrampoline( + "PyFastSUT", fast_issue_cb, flush_queries_cb, report_latency_results_cb); + return reinterpret_cast(sut); +} + +void DestroyFastSUT(uintptr_t sut) { + FastSystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + delete sut_cast; +} + + +uintptr_t ConstructQSL( + size_t total_sample_count, size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) { + QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline( + "PyQSL", total_sample_count, performance_sample_count, + load_samples_to_ram_cb, unload_samples_from_ram_cb); + return reinterpret_cast(qsl); +} + +void DestroyQSL(uintptr_t qsl) { + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + delete qsl_cast; +} + +void StartTest(uintptr_t sut, uintptr_t qsl, + mlperf::TestSettings test_settings) { + pybind11::gil_scoped_release gil_releaser; + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + LogSettings default_log_settings; + mlperf::StartTest(sut_cast, qsl_cast, test_settings, default_log_settings); +} + +void StartTestWithLogSettings(uintptr_t sut, uintptr_t qsl, + mlperf::TestSettings test_settings, + mlperf::LogSettings log_settings) { + pybind11::gil_scoped_release gil_releaser; + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + mlperf::StartTest(sut_cast, qsl_cast, test_settings, log_settings); +} + +using ResponseCallback = std::function; + +/// TODO: Get rid of copies. 
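+/// (The responses vector is currently received from Python by value, so each
+/// call copies the QuerySampleResponse objects before forwarding them to
+/// mlperf::QuerySamplesComplete; hence the TODO.)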
+void QuerySamplesComplete(std::vector responses, ResponseCallback response_cb = {}) { + pybind11::gil_scoped_release gil_releaser; + mlperf::QuerySamplesComplete(responses.data(), responses.size(), response_cb); +} + +PYBIND11_MODULE(mlperf_loadgen, m) { + m.doc() = "MLPerf Inference load generator."; + + pybind11::enum_(m, "TestScenario") + .value("SingleStream", TestScenario::SingleStream) + .value("MultiStream", TestScenario::MultiStream) + .value("MultiStreamFree", TestScenario::MultiStreamFree) + .value("Server", TestScenario::Server) + .value("Offline", TestScenario::Offline); + + pybind11::enum_(m, "TestMode") + .value("SubmissionRun", TestMode::SubmissionRun) + .value("AccuracyOnly", TestMode::AccuracyOnly) + .value("PerformanceOnly", TestMode::PerformanceOnly) + .value("FindPeakPerformance", TestMode::FindPeakPerformance); + + pybind11::class_(m, "TestSettings") + .def(pybind11::init<>()) + .def_readwrite("scenario", &TestSettings::scenario) + .def_readwrite("mode", &TestSettings::mode) + .def_readwrite("single_stream_expected_latency_ns", + &TestSettings::single_stream_expected_latency_ns) + .def_readwrite("single_stream_target_latency_percentile", + &TestSettings::single_stream_target_latency_percentile) + .def_readwrite("multi_stream_target_qps", + &TestSettings::multi_stream_target_qps) + .def_readwrite("multi_stream_target_latency_ns", + &TestSettings::multi_stream_target_latency_ns) + .def_readwrite("multi_stream_target_latency_percentile", + &TestSettings::multi_stream_target_latency_percentile) + .def_readwrite("multi_stream_samples_per_query", + &TestSettings::multi_stream_samples_per_query) + .def_readwrite("multi_stream_max_async_queries", + &TestSettings::multi_stream_max_async_queries) + .def_readwrite("server_target_qps", &TestSettings::server_target_qps) + .def_readwrite("server_target_latency_ns", + &TestSettings::server_target_latency_ns) + .def_readwrite("server_target_latency_percentile", + &TestSettings::server_target_latency_percentile) + .def_readwrite("server_coalesce_queries", + &TestSettings::server_coalesce_queries) + .def_readwrite("server_find_peak_qps_decimals_of_precision", + &TestSettings::server_find_peak_qps_decimals_of_precision) + .def_readwrite("server_find_peak_qps_boundary_step_size", + &TestSettings::server_find_peak_qps_boundary_step_size) + .def_readwrite("server_max_async_queries", + &TestSettings::server_max_async_queries) + .def_readwrite("offline_expected_qps", + &TestSettings::offline_expected_qps) + .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms) + .def_readwrite("max_duration_ms", &TestSettings::max_duration_ms) + .def_readwrite("min_query_count", &TestSettings::min_query_count) + .def_readwrite("max_query_count", &TestSettings::max_query_count) + .def_readwrite("qsl_rng_seed", &TestSettings::qsl_rng_seed) + .def_readwrite("sample_index_rng_seed", + &TestSettings::sample_index_rng_seed) + .def_readwrite("schedule_rng_seed", &TestSettings::schedule_rng_seed) + .def_readwrite("accuracy_log_rng_seed", + &TestSettings::accuracy_log_rng_seed) + .def_readwrite("accuracy_log_probability", + &TestSettings::accuracy_log_probability) + .def_readwrite("print_timestamps", &TestSettings::print_timestamps) + .def_readwrite("performance_issue_unique", + &TestSettings::performance_issue_unique) + .def_readwrite("performance_issue_same", + &TestSettings::performance_issue_same) + .def_readwrite("performance_issue_same_index", + &TestSettings::performance_issue_same_index) + .def_readwrite("performance_sample_count_override", + 
&TestSettings::performance_sample_count_override) + .def("FromConfig", &TestSettings::FromConfig, "FromConfig."); + + pybind11::enum_(m, "LoggingMode") + .value("AsyncPoll", LoggingMode::AsyncPoll) + .value("EndOfTestOnly", LoggingMode::EndOfTestOnly) + .value("Synchronous", LoggingMode::Synchronous); + + pybind11::class_(m, "LogOutputSettings") + .def(pybind11::init<>()) + .def_readwrite("outdir", &LogOutputSettings::outdir) + .def_readwrite("prefix", &LogOutputSettings::prefix) + .def_readwrite("suffix", &LogOutputSettings::suffix) + .def_readwrite("prefix_with_datetime", + &LogOutputSettings::prefix_with_datetime) + .def_readwrite("copy_detail_to_stdout", + &LogOutputSettings::copy_detail_to_stdout) + .def_readwrite("copy_summary_to_stdout", + &LogOutputSettings::copy_summary_to_stdout); + + pybind11::class_(m, "LogSettings") + .def(pybind11::init<>()) + .def_readwrite("log_output", &LogSettings::log_output) + .def_readwrite("log_mode", &LogSettings::log_mode) + .def_readwrite("log_mode_async_poll_interval_ms", + &LogSettings::log_mode_async_poll_interval_ms) + .def_readwrite("enable_trace", &LogSettings::enable_trace); + + pybind11::class_(m, "QuerySample") + .def(pybind11::init<>()) + .def(pybind11::init()) + .def_readwrite("id", &QuerySample::id) + .def_readwrite("index", &QuerySample::index) + .def(pybind11::pickle( + [] (const QuerySample &qs) { // __getstate__ + /*Return a tuple that fully encodes state of object*/ + return pybind11::make_tuple(qs.id, qs.index); + }, + [] (pybind11::tuple t) { // __setstate__ + if (t.size() != 2) + throw std::runtime_error("Invalid state for QuerySample"); + /* Create a new C++ instance*/ + QuerySample q; + q.id = t[0].cast(); + q.index = t[1].cast(); + return q; + })); + + pybind11::class_(m, "QuerySampleResponse") + .def(pybind11::init<>()) + .def(pybind11::init()) + .def_readwrite("id", &QuerySampleResponse::id) + .def_readwrite("data", &QuerySampleResponse::data) + .def_readwrite("size", &QuerySampleResponse::size) + .def(pybind11::pickle( + [] (const QuerySampleResponse &qsr) { // __getstate__ + /* Return a tuple that fully encodes state of object*/ + return pybind11::make_tuple(qsr.id, qsr.data, qsr.size); + }, + [] (pybind11::tuple t) { // __setstate__ + if (t.size() != 3) + throw std::runtime_error("Invalid state for QuerySampleResponse"); + /* Create a new C++ instance*/ + QuerySampleResponse q; + q.id = t[0].cast(); + q.data = t[1].cast(); + q.size = t[2].cast(); + return q; + })); + + // TODO: Use PYBIND11_MAKE_OPAQUE for the following vector types. + pybind11::bind_vector>(m, "VectorQuerySample"); + pybind11::bind_vector>( + m, "VectorQuerySampleResponse"); + + m.def("ConstructSUT", &py::ConstructSUT, "Construct the system under test."); + m.def("DestroySUT", &py::DestroySUT, + "Destroy the object created by ConstructSUT."); + + m.def("ConstructFastSUT", &py::ConstructFastSUT, + "Construct the system under test, fast issue query"); + m.def("DestroyFastSUT", &py::DestroyFastSUT, + "Destroy the object created by ConstructFastSUT."); + + m.def("ConstructQSL", &py::ConstructQSL, + "Construct the query sample library."); + m.def("DestroyQSL", &py::DestroyQSL, + "Destroy the object created by ConstructQSL."); + + m.def("StartTest", &py::StartTest, + "Run tests on a SUT created by ConstructSUT() with the provided QSL. " + "Uses default log settings."); + m.def("StartTestWithLogSettings", &py::StartTestWithLogSettings, + "Run tests on a SUT created by ConstructSUT() with the provided QSL. 
" + "Accepts custom log settings."); + m.def("QuerySamplesComplete", &py::QuerySamplesComplete, + "Called by the SUT to indicate that samples from some combination of" + "IssueQuery calls have finished.", pybind11::arg("responses"), pybind11::arg("response_cb") = ResponseCallback{}); +} + +} // namespace py +} // namespace mlperf + +#endif // PYTHON_BINDINGS_H diff --git a/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream.py b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream.py new file mode 100644 index 0000000..141b27a --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream.py @@ -0,0 +1,92 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen +import numpy + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in 3 slices that complete at different times. 
+def process_query_async(query_samples, i_slice): + time.sleep(.001 * (i_slice + 1)) + responses = [] + samples_to_complete = query_samples[i_slice:len(query_samples):3] + for s in samples_to_complete: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=(query_samples, 0)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 1)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 2)).start() + + +def flush_queries(): + pass + + +def process_latencies(latencies_ns): + print("Average latency: ") + print(numpy.mean(latencies_ns)) + print("Median latency: ") + print(numpy.percentile(latencies_ns, 50)) + print("90 percentile latency: ") + print(numpy.percentile(latencies_ns, 90)) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.MultiStream + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.multi_stream_target_latency_ns = 100000000 + settings.multi_stream_samples_per_query = 4 + settings.multi_stream_max_async_queries = 2 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT( + issue_query, flush_queries, process_latencies) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream_free.py b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream_free.py new file mode 100644 index 0000000..a603059 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_multi_stream_free.py @@ -0,0 +1,92 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen +import numpy + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in 3 slices that complete at different times. 
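+# Slice i handles samples i, i+3, i+6, ... and sleeps (i+1) ms first, so the
+# three worker threads emulate responses arriving back at different times.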
+def process_query_async(query_samples, i_slice): + time.sleep(.001 * (i_slice + 1)) + responses = [] + samples_to_complete = query_samples[i_slice:len(query_samples):3] + for s in samples_to_complete: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=(query_samples, 0)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 1)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 2)).start() + + +def flush_queries(): + pass + + +def process_latencies(latencies_ns): + print("Average latency: ") + print(numpy.mean(latencies_ns)) + print("Median latency: ") + print(numpy.percentile(latencies_ns, 50)) + print("90 percentile latency: ") + print(numpy.percentile(latencies_ns, 90)) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.MultiStreamFree + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.multi_stream_target_latency_ns = 100000000 + settings.multi_stream_samples_per_query = 4 + settings.multi_stream_max_async_queries = 2 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT( + issue_query, flush_queries, process_latencies) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_offline.py b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_offline.py new file mode 100644 index 0000000..c152530 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_offline.py @@ -0,0 +1,88 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen +import numpy + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in 3 slices that complete at different times. 
+def process_query_async(query_samples, i_slice): + time.sleep(3 * (i_slice + 1)) + responses = [] + samples_to_complete = query_samples[i_slice:len(query_samples):3] + for s in samples_to_complete: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=(query_samples, 0)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 1)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 2)).start() + + +def flush_queries(): + pass + + +def process_latencies(latencies_ns): + print("Average latency: ") + print(numpy.mean(latencies_ns)) + print("Median latency: ") + print(numpy.percentile(latencies_ns, 50)) + print("90 percentile latency: ") + print(numpy.percentile(latencies_ns, 90)) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Offline + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.offline_expected_qps = 1000 + + sut = mlperf_loadgen.ConstructSUT( + issue_query, flush_queries, process_latencies) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_server.py b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_server.py new file mode 100644 index 0000000..75aa82f --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_server.py @@ -0,0 +1,85 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. 
+""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen +import numpy + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + time.sleep(.001) + responses = [] + for s in query_samples: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=[query_samples]).start() + + +def flush_queries(): + pass + + +def process_latencies(latencies_ns): + print("Average latency: ") + print(numpy.mean(latencies_ns)) + print("Median latency: ") + print(numpy.percentile(latencies_ns, 50)) + print("99 percentile latency: ") + print(numpy.percentile(latencies_ns, 99)) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Server + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.server_target_qps = 100 + settings.server_target_latency_ns = 100000000 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT( + issue_query, flush_queries, process_latencies) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_single_stream.py b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_single_stream.py new file mode 100644 index 0000000..53efa42 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/demos/py_demo_single_stream.py @@ -0,0 +1,93 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. 
+""" + +from __future__ import print_function + +import array +import threading +import time + +from absl import app +import mlperf_loadgen +import numpy + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + """Processes the list of queries.""" + time.sleep(.001) + responses = [] + response_array = array.array( + "f", [0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 254, 255]) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * response_array.itemsize + for s in query_samples: + responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=[query_samples]).start() + + +def flush_queries(): + pass + + +def process_latencies(latencies_ns): + print("Average latency: ") + print(numpy.mean(latencies_ns)) + print("Median latency: ") + print(numpy.percentile(latencies_ns, 50)) + print("90 percentile latency: ") + print(numpy.percentile(latencies_ns, 90)) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.SingleStream + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.single_stream_expected_latency_ns = 1000000 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT( + issue_query, flush_queries, process_latencies) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/benchmarks/rnnt/ootb/inference/loadgen/docs/src/BUILD.gn b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/BUILD.gn new file mode 100644 index 0000000..865bc4d --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/BUILD.gn @@ -0,0 +1,33 @@ +generated_doxygen_out_dir = + get_path_info(".", "gen_dir") + "/.." + +loadgen_doxygen_sources = [ + "doxygen.cfg", + "doxygen_footer.html", + "doxygen_header.html", + "doxygen_layout.xml", + "doxygen_stylesheet.css", + "loadgen-integration_diagram.dia", + "mlperf_icon.png", + "mlperf_logo_horizontal_color.svg", + "README.md" +] + +source_set("loadgen_doxygen_sources") { + sources = loadgen_doxygen_sources +} + +source_set("doxygen_html_generator_script") { + sources = [ "doxygen_html_generator.py" ] +} + +action("generate_doxygen_html") { + script = "doxygen_html_generator.py" + args = [ rebase_path(generated_doxygen_out_dir, root_build_dir), + rebase_path("../..") ] + outputs = [ generated_doxygen_out_dir ] + deps = [ ":loadgen_doxygen_sources", + ":doxygen_html_generator_script", + "../..:mlperf_loadgen_sources_no_gen", + "../..:docs" ] +} diff --git a/benchmarks/rnnt/ootb/inference/loadgen/docs/src/README.md b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/README.md new file mode 100644 index 0000000..d5cf5fe --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/README.md @@ -0,0 +1,34 @@ +# Generating the HTML docs {#ReadmeHtmlDocs} + +This document is generated from inline docstrings in the source and +various markdown files checked into the git repository. If you've +checked out the code, you can generate this documentation. 
+ +*Prerequisite:* You must have [doxygen](http://www.doxygen.nl) installed +on your system: + +## With gn / ninja + +If you are using the gn build flow, you may run: + +    ninja -C out/Release generate_doxygen_html + +* This will output the documentation to out/Release/gen/loadgen/docs/gen and +avoid polluting the source directory. + +## Manually + +Alternatively, you can manually run: + +    python docs/src/doxygen_html_generator.py <doxygen_out_dir> <loadgen_root> + +* If <loadgen_root> is omitted, it will default to ".". +* If <doxygen_out_dir> is also omitted, it will default to "./docs/gen". + +## Hosting + +A version of this doc is currently hosted online at +https://mlperf.github.io/inference/loadgen/index.html + +To update the hosted version, submit a PR to the +[mlperf.github.io](https://github.com/mlperf/mlperf.github.io) repository. diff --git a/benchmarks/rnnt/ootb/inference/loadgen/docs/src/doxygen.cfg b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/doxygen.cfg new file mode 100644 index 0000000..fc05853 --- /dev/null +++ b/benchmarks/rnnt/ootb/inference/loadgen/docs/src/doxygen.cfg @@ -0,0 +1,2495 @@ +# Doxyfile 1.8.13 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "LoadGen Guide" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory.
+ +PROJECT_LOGO = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_logo_horizontal_color.svg + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = $(MLPERF_DOXYGEN_OUT_PATH) + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. 
+ +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = YES + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. 
+ +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. 
+# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 1 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. 
+ +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. 
+ +EXTRACT_PRIVATE = YES + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = YES + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. 
+# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. 
By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. 
You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_layout.xml + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). 
+ +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = $(MLPERF_LOADGEN_SRC_PATH) + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = depot_tools + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. 
The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = $(MLPERF_LOADGEN_SRC_PATH)/docs/src + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# +# +# where is the value of the INPUT_FILTER tag, and is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. 
+ +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. 
+ +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = YES + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = -I ../third_party/pybind/include --std=c++14 + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot o= +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. 
+# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_header.html + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_footer.html + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_stylesheet.css + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_EXTRA_FILES = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_icon.png \ + $(MLPERF_LOADGEN_SRC_PATH)/loadgen_integration_diagram.svg + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 127 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = YES + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 50 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = YES + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. 
For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/