@@ -98,10 +98,6 @@ class ModelState : public BackendModel {
     return enable_jit_executor_pair_;
   }
   bool EnabledInferenceMode() { return enable_inference_mode_; }
-  const std::pair<bool, bool>& EnabledNvfuserPair() const
-  {
-    return enable_nvfuser_pair_;
-  }
   bool EnabledCacheCleaning() { return enable_cache_cleaning_; }
 
   bool EnabledWeightSharing() { return enable_weight_sharing_; }
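The accessor removed in this hunk follows the backend's (set, value) flag-pair convention: the first element records whether the parameter was explicitly set in the model config, the second carries the requested value. A minimal sketch of that convention, using illustrative names rather than the backend's actual members:

```cpp
#include <utility>

// (set, value): "set" records whether the config supplied the parameter,
// "value" is what to apply. These names are illustrative, not the backend's.
std::pair<bool, bool> enable_feature_pair{false, true};

void ApplyFeature(void (*setter)(bool))
{
  // Default behavior is to do nothing: only act when the parameter was
  // explicitly set, then forward the configured value.
  if (enable_feature_pair.first) {
    setter(enable_feature_pair.second);
  }
}
```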
@@ -132,16 +128,11 @@ class ModelState : public BackendModel {
 
   // Flag pairs to indicate if various JIT settings are set and
   // enabled respectively. Defaults to (false, true). Default behavior
-  // is to do nothing if not explicitly set. Tensor fuser flag is
-  // ignore if nvfuser is explicitly set.
+  // is to do nothing if not explicitly set.
   std::pair<bool, bool> enable_tensor_fuser_pair_;
   std::pair<bool, bool> enable_jit_profiling_pair_;
   std::pair<bool, bool> enable_jit_executor_pair_;
 
-  // Flag pair to indicate whether nvfuser is set and enabled respectively.
-  // Defaults to (false, false).
-  std::pair<bool, bool> enable_nvfuser_pair_;
-
   // Model mapping for shared TorchScript model across all instances on the
   // same device. The key is a pair of isGPU and device index.
   std::map<
@@ -233,8 +224,7 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
       enable_inference_mode_(true), enable_cache_cleaning_(false),
       enable_weight_sharing_(false), enable_tensor_fuser_pair_({false, true}),
       enable_jit_profiling_pair_({false, true}),
-      enable_jit_executor_pair_({false, true}),
-      enable_nvfuser_pair_({false, false})
+      enable_jit_executor_pair_({false, true})
 {
 }
 
@@ -475,29 +465,6 @@ ModelState::ParseParameters()
475465 " for model instance '" + Name () + " '" )
476466 .c_str ());
477467 }
478-
479- // If 'ENABLE_NVFUSER' is not present in 'parameters' then no
480- // update is made to 'enable_nvfuser'.
481- bool enable_nvfuser = false ;
482- err = ParseParameter (params, " ENABLE_NVFUSER" , &enable_nvfuser);
483- if (err != nullptr ) {
484- if (TRITONSERVER_ErrorCode (err) != TRITONSERVER_ERROR_NOT_FOUND) {
485- return err;
486- } else {
487- LOG_MESSAGE (
488- TRITONSERVER_LOG_INFO, (std::string (" NvFuser is not specified" ) +
489- " for model instance '" + Name () + " '" )
490- .c_str ());
491- TRITONSERVER_ErrorDelete (err);
492- }
493- } else {
494- enable_nvfuser_pair_ = {true , enable_nvfuser};
495- LOG_MESSAGE (
496- TRITONSERVER_LOG_INFO, (std::string (" NvFuser is " ) +
497- (enable_nvfuser ? " enabled" : " disabled" ) +
498- " for model instance '" + Name () + " '" )
499- .c_str ());
500- }
501468 }
502469
503470 return nullptr ;
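The block removed above used the backend's parse-and-tolerate-missing idiom: a NOT_FOUND error from ParseParameter means the parameter was simply absent, so the default is kept and the error released. A hedged reconstruction of that idiom for the surviving `ENABLE_TENSOR_FUSER` flag, meant as a fragment of ParseParameters() rather than a standalone unit; the log message that the real code presumably emits is omitted here:

```cpp
// Sketch of the idiom for one optional boolean parameter. "params", "err",
// and ParseParameter come from the surrounding backend code; the
// TRITONSERVER_* calls are the Triton in-process error API.
bool enable_tensor_fuser = false;
err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser);
if (err != nullptr) {
  if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) {
    return err;  // a real parse failure propagates to the caller
  }
  // Parameter absent: keep the default pair so Execute() makes no change.
  TRITONSERVER_ErrorDelete(err);
} else {
  enable_tensor_fuser_pair_ = {true, enable_tensor_fuser};
}
```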
@@ -1552,34 +1519,13 @@ ModelInstanceState::Execute(
         std::get<1>(model_state_->EnabledJitExecutor());
   }
 
-  // Fuser. Parameter is ignored if NVFuser parameter is explicitly
-  // set (either enabled or disabled). No change is made unless
-  // fuser is explicitly set in parameters.
-  if (!std::get<0>(model_state_->EnabledNvfuserPair()) &&
-      std::get<0>(model_state_->EnabledTensorExprFuser())) {
+  // Fuser. No change is made unless fuser is explicitly set in
+  // parameters.
+  if (std::get<0>(model_state_->EnabledTensorExprFuser())) {
     torch::jit::setTensorExprFuserEnabled(
         std::get<1>(model_state_->EnabledTensorExprFuser()));
   }
 
-  // NV-Fuser. No change is made unless parameter is explicitly set.
-  if (std::get<0>(model_state_->EnabledNvfuserPair())) {
-    bool is_device_gpu =
-        (device_.is_cuda() ||
-         ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) &&
-          (device_cnt_ > 0)));
-    if (std::get<1>(model_state_->EnabledNvfuserPair()) && is_device_gpu) {
-      torch::jit::overrideCanFuseOnCPU(false);
-      torch::jit::overrideCanFuseOnGPU(false);
-      torch::jit::setTensorExprFuserEnabled(false);
-      torch::jit::fuser::cuda::setEnabled(true);
-    } else {
-      torch::jit::overrideCanFuseOnCPU(true);
-      torch::jit::overrideCanFuseOnGPU(true);
-      torch::jit::setTensorExprFuserEnabled(true);
-      torch::jit::fuser::cuda::setEnabled(false);
-    }
-  }
-
   torch::NoGradGuard no_grad;
 
   // If input is a dictionary, prepare dictionary from 'input_tensors'.
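With the NvFuser branch gone, Execute() only forwards the tensor-expression fuser flag to libtorch when it was explicitly configured. A condensed, self-contained sketch of the surviving behavior; the wrapper function is an assumption for illustration, while setTensorExprFuserEnabled is libtorch's real toggle:

```cpp
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>

#include <utility>

// Hypothetical wrapper mirroring the surviving logic in Execute():
// act only when ENABLE_TENSOR_FUSER was set in the model config,
// following the (set, value) pair convention sketched earlier.
void ApplyFuserSetting(const std::pair<bool, bool>& tensor_fuser_pair)
{
  if (tensor_fuser_pair.first) {
    torch::jit::setTensorExprFuserEnabled(tensor_fuser_pair.second);
  }
}
```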