[NupharEP] refine parallel schedule control (microsoft#2514)
* [NupharEP] Add parallel schedule to JIT function name
Update Nuphar docker to use Python 3.6 and ubuntu 18.04

* Update notebook

* Avoid JIT cache file name conflict
KeDengMS authored Dec 3, 2019
1 parent 784eca0 commit c1be615
Showing 5 changed files with 84 additions and 31 deletions.
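Context for the diffs that follow: the parallel schedule this commit encodes into the JIT function name is chosen when a session is created, either through the NUPHAR_PARALLEL_MIN_WORKLOADS environment variable or through a Nuphar settings string. A minimal sketch of both knobs, taken from the notebook change below and intended for a Nuphar-enabled onnxruntime build; the model path is a placeholder:

    import os
    import onnxruntime

    model_path = 'model.onnx'  # placeholder; any model loadable by the Nuphar build

    # Option 1: environment variable (read when the session is created);
    # 0 disables the parallel schedule that is otherwise on in MKLML/OpenMP builds.
    os.environ['NUPHAR_PARALLEL_MIN_WORKLOADS'] = '0'

    # Option 2: settings string, via the internal helper the tutorial notebook uses.
    onnxruntime.capi._pybind_state.set_nuphar_settings('nuphar_parallel_min_workloads:0')

    sess = onnxruntime.InferenceSession(model_path)

The notebook applies either knob before constructing the InferenceSession, since the setting is picked up at session-creation time.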
4 changes: 2 additions & 2 deletions dockerfiles/Dockerfile.nuphar
@@ -3,9 +3,9 @@
# Licensed under the MIT License.
#--------------------------------------------------------------------------

FROM ubuntu:16.04
FROM ubuntu:18.04

ARG PYTHON_VERSION=3.5
ARG PYTHON_VERSION=3.6
ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
ARG ONNXRUNTIME_SERVER_BRANCH=master

96 changes: 75 additions & 21 deletions docs/python/notebooks/onnxruntime-nuphar-tutorial.ipynb
@@ -25,7 +25,8 @@
"3. Create and run inference on a model using ***LSTM***, run symbolic shape inference, edit LSTM ops to Scan, and check Nuphar speedup.\n",
"4. ***Quantize*** the LSTM model and check speedup in Nuphar (CPU with AVX2 support is required).\n",
"5. Working on real models from onnx model zoo: ***BERT squad*** and ***Bidirectional Attention Flow ([BiDAF](https://arxiv.org/pdf/1611.01603))***.\n",
"6. ***Ahead-Of-Time (AOT) compilation*** to save just-in-time compilation cost on model load.\n"
"6. ***Ahead-Of-Time (AOT) compilation*** to save just-in-time compilation cost on model load.\n",
"7. Performance tuning for single thread inference.\n"
]
},
{
@@ -173,11 +174,11 @@
{
"data": {
"text/plain": [
"['produce node4 {\\n',\n",
" ' for (ax0, 0, seq) {\\n',\n",
" ' for (ax1, 0, batch) {\\n',\n",
" ' for (ax2.outer, 0, 64) {\\n',\n",
" ' node4[ramp((((((ax0*batch) + ax1)*64) + ax2.outer)*16), 1, 16)] = (input[ramp((((((ax0*batch) + ax1)*64) + ax2.outer)*16), 1, 16)] + (input[ramp((((((ax0*batch) + ax1)*64) + ax2.outer)*16), 1, 16)]*(input[ramp((((((ax0*batch) + ax1)*64) + ax2.outer)*16), 1, 16)] + (input[ramp((((((ax0*batch) + ax1)*64) + ax2.outer)*16), 1, 16)]*(input[ramp((((((ax0*batch) + ax1)*64) + ax2.outer)*16), 1, 16)] + input[ramp((((((ax0*batch) + ax1)*64) + ax2.outer)*16), 1, 16)])))))\\n',\n",
"[' for (ax2.outer, 0, 64) {\\n',\n",
" ' if ((0 <= (ax0.ax1.fused/batch))) {\\n',\n",
" ' if (((ax0.ax1.fused/batch) < seq)) {\\n',\n",
" ' node4[ramp((((ax0.ax1.fused*64) + ax2.outer)*16), 1, 16)] = (input[ramp((((ax0.ax1.fused*64) + ax2.outer)*16), 1, 16)] + (input[ramp((((ax0.ax1.fused*64) + ax2.outer)*16), 1, 16)]*(input[ramp((((ax0.ax1.fused*64) + ax2.outer)*16), 1, 16)] + (input[ramp((((ax0.ax1.fused*64) + ax2.outer)*16), 1, 16)]*(input[ramp((((ax0.ax1.fused*64) + ax2.outer)*16), 1, 16)] + input[ramp((((ax0.ax1.fused*64) + ax2.outer)*16), 1, 16)])))))\\n',\n",
" ' }\\n',\n",
" ' }\\n',\n",
" ' }\\n',\n",
" ' }\\n',\n",
@@ -215,8 +216,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Fusion speed-up 261.36%\n",
" Baseline: 0.722 s, Current: 0.200 s\n"
"Fusion speed-up 437.43%\n",
" Baseline: 0.733 s, Current: 0.136 s\n"
]
}
],
@@ -338,8 +339,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Nuphar Scan speed-up 7.18%\n",
" Baseline: 3.067 s, Current: 2.862 s\n"
"Nuphar Scan speed-up 1.97%\n",
" Baseline: 3.062 s, Current: 3.003 s\n"
]
}
],
@@ -443,8 +444,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Quantization speed-up 179.78%\n",
" Baseline: 2.862 s, Current: 1.023 s\n"
"Quantization speed-up 196.18%\n",
" Baseline: 3.003 s, Current: 1.014 s\n"
]
}
],
@@ -574,8 +575,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Nuphar BERT squad speed-up 31.65%\n",
" Baseline: 4.844 s, Current: 3.679 s\n"
"Nuphar BERT squad speed-up 67.20%\n",
" Baseline: 5.089 s, Current: 3.044 s\n"
]
}
],
@@ -764,8 +765,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Nuphar quantized BiDAF speed-up 26.28%\n",
" Baseline: 0.318 s, Current: 0.252 s\n"
"Nuphar quantized BiDAF speed-up 44.03%\n",
" Baseline: 0.304 s, Current: 0.211 s\n"
]
}
],
@@ -794,7 +795,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6. Ahead-Of-Time (AOT) compilation\n",
"## 6. Ahead-Of-Time (AOT) compilation\n",
"Nuphar runs Just-in-time (JIT) compilation when loading models. The compilation may lead to slow cold start. We can use create_shared script to build dll from JIT code and accelerate model loading."
]
},
@@ -806,7 +807,7 @@
{
"data": {
"text/plain": [
"'JIT took 3.964 seconds'"
"'JIT took 4.612 seconds'"
]
},
"execution_count": 28,
@@ -886,8 +887,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"AOT speed-up 770.73%\n",
" Baseline: 3.964 s, Current: 0.455 s\n"
"AOT speed-up 952.77%\n",
" Baseline: 4.612 s, Current: 0.438 s\n"
]
}
],
@@ -900,6 +901,59 @@
"end_aot = timer()\n",
"print_speedup('AOT', end_jit - start_jit, end_aot - start_aot)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Performance tuning for single thread inference.\n",
"By default, Nuphar enables parallel schedule for lower inference latency with multiple threads, when building with MKLML or OpenMP. For some models, user may want to run single-thread inference for better throughput with multiple concurrent inference threads, and turning off parallel schedule may make inference a bit faster in single thread."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Single thread perf w/o parallel schedule speed-up 3.80%\n",
" Baseline: 0.318 s, Current: 0.306 s\n"
]
}
],
"source": [
"# set OMP_NUM_THREADS to 1 for single thread inference\n",
"# this would mak\n",
"os.environ['OMP_NUM_THREADS'] = '1'\n",
"\n",
"sess = onnxruntime.InferenceSession(bidaf_converted)\n",
"start_baseline = timer()\n",
"for i in range(repeats):\n",
" output_baseline = sess_baseline.run([], feed)\n",
"end_baseline = timer()\n",
"\n",
"# use NUPHAR_PARALLEL_MIN_WORKLOADS=0 to turn off parallel schedule, using settings string\n",
"# it can be set from environment variable too: os.environ['NUPHAR_PARALLEL_MIN_WORKLOADS'] = '0'\n",
"settings = 'nuphar_parallel_min_workloads:0'\n",
"onnxruntime.capi._pybind_state.set_nuphar_settings(settings)\n",
"sess = onnxruntime.InferenceSession(bidaf_converted)\n",
"\n",
"start = timer()\n",
"for i in range(repeats):\n",
" output = sess_baseline.run([], feed)\n",
"end = timer()\n",
"print_speedup('Single thread perf w/o parallel schedule', end_baseline - start_baseline, end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -923,7 +977,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
"version": "3.6.9"
},
"msauthor": "ke.deng"
},
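The notebook's AOT section and the cache file naming change in nuphar_tvm_utils.cc below both revolve around Nuphar's on-disk JIT cache. A rough sketch of warming that cache, assuming the NUPHAR_CACHE_PATH environment variable from the Nuphar EP documentation (it does not appear in this diff) and a placeholder model path; packaging the cached object files into a shared library is a separate step done with the create_shared script mentioned in the notebook:

    import os
    import onnxruntime

    # Assumed from the Nuphar EP docs, not from this diff: directory for cached JIT output.
    os.environ['NUPHAR_CACHE_PATH'] = os.path.abspath('nuphar_cache')

    model_path = 'model_converted.onnx'  # placeholder, e.g. a Scan-converted model from the tutorial

    # Cold start: subgraphs are JIT-compiled and their object files are written into the cache.
    # With this commit each object file is named after its packed function, which now also
    # encodes the parallel-schedule setting, so different settings no longer collide in the cache.
    sess = onnxruntime.InferenceSession(model_path)

    # After the create_shared script packages the cached objects into a shared library
    # (see the notebook's AOT section), later loads reuse it and skip JIT compilation.
    sess_again = onnxruntime.InferenceSession(model_path)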
8 changes: 3 additions & 5 deletions onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc
@@ -142,8 +142,6 @@ tvm::runtime::PackedFunc LoadTVMPackedFuncFromCache(const std::string& func_name
return func;
}

thread_local int saved_tvm_model_cnt = 0;

void SaveTVMModuleToCache(const std::string& filename, tvm::runtime::Module& module) {
fs::path path;

@@ -156,7 +154,7 @@ void SaveTVMModuleToCache(const std::string& filename, tvm::runtime::Module& mod
if (existing_files.count(filename) == 0 &&
GetOrCreateTVMModuleCacheDirectory(path, /*create*/ true)) {
existing_files.insert(filename);
path.append("cached_" + std::to_string(saved_tvm_model_cnt++) + ".o");
path.append(filename + ".o");
if (fs::exists(path)) {
LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << "Object file " << path << " already exists, skip saving...";
return;
@@ -165,9 +163,9 @@ void SaveTVMModuleToCache(const std::string& filename, tvm::runtime::Module& mod
}
}

std::string GetPackedFuncName(const nuphar::NupharSubgraphUnit& subgraph, const CodeGenTarget& codegen_target) {
std::string GetPackedFuncName(const nuphar::NupharSubgraphUnit& subgraph, const CodeGenTarget& codegen_target, int64_t parallel_min_workloads) {
// in C, a function does not allow its name starting with a digit.
return NormalizeCppName("_" + subgraph.UniqueId() + " " + codegen_target.GetTargetName());
return NormalizeCppName("_" + subgraph.UniqueId() + "_" + codegen_target.GetTargetName() + "_p" + std::to_string(parallel_min_workloads));
}

bool TryCreateConstantScalar(
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.h
@@ -23,7 +23,7 @@ tvm::runtime::PackedFunc
LoadTVMPackedFuncFromCache(const std::string& func_name);
void SaveTVMModuleToCache(const std::string& filename, tvm::runtime::Module& module);

std::string GetPackedFuncName(const nuphar::NupharSubgraphUnit& subgraph, const CodeGenTarget& codegen_target);
std::string GetPackedFuncName(const nuphar::NupharSubgraphUnit& subgraph, const CodeGenTarget& codegen_target, int64_t parallel_min_workloads);

bool TryCreateConstantScalar(tvm::Expr& scalar, const Tensor* tensor);
} // namespace nuphar
5 changes: 3 additions & 2 deletions onnxruntime/core/providers/nuphar/compiler/nuphar_compiler.cc
@@ -208,8 +208,9 @@ Status NupharCompiler::Lower(const nuphar::NupharSubgraphUnit& subgraph,
tvm::Target tvm_host_target,
NupharFuncInfo* func_info,
nuphar::OrtSubgraphAllocationInfo* partition_info) {
const auto& target_codegen = *context_.GetCodeGenHandle()->codegen_target;
std::string func_name = nuphar::GetPackedFuncName(subgraph, target_codegen);
const auto& codegen_handle = context_.GetCodeGenHandle();
const auto& target_codegen = *codegen_handle->codegen_target;
std::string func_name = nuphar::GetPackedFuncName(subgraph, target_codegen, codegen_handle->parallel_min_workloads);
tvm::BuildConfig config = CreateConfig(*subgraph.nodes.front(),
context_.GetCodeGenHandle()->allow_unaligned_buffers);
