Add BackendAttribute for parallel model instance loading #235

Merged: 9 commits, Jul 26, 2023
22 changes: 21 additions & 1 deletion include/triton/core/tritonbackend.h
@@ -94,7 +94,7 @@ struct TRITONBACKEND_Batcher;
/// }
///
#define TRITONBACKEND_API_VERSION_MAJOR 1
#define TRITONBACKEND_API_VERSION_MINOR 13
#define TRITONBACKEND_API_VERSION_MINOR 14

/// Get the TRITONBACKEND API version supported by Triton. This value
/// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and
@@ -1480,6 +1480,26 @@ TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
const uint64_t* device_ids, const uint64_t id_count);

/// Sets whether or not the backend supports concurrently loading multiple
/// TRITONBACKEND_ModelInstances in a thread-safe manner.
///
/// Most backends are thread-safe for parallel execution of model instances as
/// that is the primary use of concurrency in backends. However, not all
/// backends are thread-safe when initializing or finalizing model instances. In
/// order for Triton to know that it can safely load instances concurrently, the
/// backend needs to opt in by setting this backend attribute to true. By
/// default, this attribute is false and calls to the
/// TRITONBACKEND_ModelInstanceInitialize function will be made serially. If
/// this attribute is set to true, then Triton will make calls to
/// TRITONBACKEND_ModelInstanceInitialize concurrently.
///
/// \param backend_attributes The backend attributes object.
/// \param enabled Whether or not the backend supports loading model instances
/// in parallel.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeSetParallelInstanceLoading(
TRITONBACKEND_BackendAttribute* backend_attributes, bool enabled);

/// TRITONBACKEND Batching
///
/// API to add custom batching strategy
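A backend whose instance initialization is thread-safe can opt in with a single call from its backend-attribute entry point. The following is a minimal sketch, not part of this diff, and it assumes the backend implements the optional TRITONBACKEND_GetBackendAttribute entry point declared elsewhere in tritonbackend.h:

#include "triton/core/tritonbackend.h"

extern "C" TRITONSERVER_Error*
TRITONBACKEND_GetBackendAttribute(
    TRITONBACKEND_Backend* backend,
    TRITONBACKEND_BackendAttribute* backend_attributes)
{
  // Sketch: this backend's TRITONBACKEND_ModelInstanceInitialize is
  // thread-safe, so allow Triton to call it concurrently for multiple
  // instances of the same model.
  TRITONSERVER_Error* err =
      TRITONBACKEND_BackendAttributeSetParallelInstanceLoading(
          backend_attributes, true);
  if (err != nullptr) {
    return err;  // propagate the error to Triton
  }
  return nullptr;  // success
}

Backends that are not certain their instance initialization is thread-safe should leave the attribute at its default (false) and keep serial loading.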
1 change: 1 addition & 0 deletions src/backend_manager.cc
@@ -120,6 +120,7 @@ TritonBackend::UpdateAttributes()
if (!latest.preferred_groups_.empty()) {
attributes_.preferred_groups_ = latest.preferred_groups_;
}
attributes_.parallel_instance_loading_ = latest.parallel_instance_loading_;
return Status::Success;
}

8 changes: 7 additions & 1 deletion src/backend_manager.h
@@ -47,9 +47,15 @@ namespace triton { namespace core {
class TritonBackend {
public:
struct Attribute {
Attribute() : exec_policy_(TRITONBACKEND_EXECUTION_BLOCKING) {}
Attribute()
: exec_policy_(TRITONBACKEND_EXECUTION_BLOCKING),
parallel_instance_loading_(false)
{
}
TRITONBACKEND_ExecutionPolicy exec_policy_;
std::vector<inference::ModelInstanceGroup> preferred_groups_;
// Whether the backend supports loading model instances in parallel
bool parallel_instance_loading_;
};
typedef TRITONSERVER_Error* (*TritonModelInitFn_t)(
TRITONBACKEND_Model* model);
10 changes: 10 additions & 0 deletions src/backend_model.cc
@@ -1531,6 +1531,16 @@ TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
return nullptr;
}


TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeSetParallelInstanceLoading(
TRITONBACKEND_BackendAttribute* backend_attributes, bool enabled)
{
auto ba = reinterpret_cast<TritonBackend::Attribute*>(backend_attributes);
ba->parallel_instance_loading_ = enabled;
return nullptr;
}

} // extern C

}} // namespace triton::core
120 changes: 82 additions & 38 deletions src/backend_model_instance.cc
@@ -26,6 +26,8 @@

#include "backend_model_instance.h"

#include <future>

#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
@@ -204,6 +206,17 @@ TritonModelInstance::SetInstances(
existing_instances = model->IndexInstances();

static triton::common::HostPolicyCmdlineConfig empty_host_policy;
std::vector<std::future<Status>> creation_results;

// Deferred will be lazily evaluated when the result is requested. Since the
// creation_results are requested serially below, this is equivalent to making
// the calls serially.
auto launch_policy = std::launch::deferred;
// If the backend supports it, std::launch::async will allow the calls to be
// made concurrently.
if (model->Backend()->BackendAttributes().parallel_instance_loading_) {
launch_policy = std::launch::async;
}

for (const auto& group : model_config.instance_group()) {
std::vector<std::string> profile_names;
@@ -252,6 +265,7 @@
for (const auto& is : instance_setting) {
const auto& kind = std::get<1>(is);
const auto& id = std::get<2>(is);
const auto rate_limiter_config_ptr = std::get<3>(is);

const Signature signature(group, id);
// Check if an existing instance can be re-used.
@@ -271,53 +285,83 @@

// No matching instance for re-using, so create the instance.
const std::string& policy_name = std::get<0>(is);
const triton::common::HostPolicyCmdlineConfig* host_policy;
const triton::common::HostPolicyCmdlineConfig* host_policy =
&empty_host_policy;
const auto policy_it = host_policy_map.find(policy_name);
if (policy_it != host_policy_map.end()) {
host_policy = &policy_it->second;
} else {
host_policy = &empty_host_policy;
}
std::shared_ptr<TritonModelInstance> new_instance;
RETURN_IF_ERROR(SetNumaConfigOnThread(*host_policy));
auto err = CreateInstance(
model, instance_name, signature, kind, id, profile_names, passive,
policy_name, *host_policy, *(std::get<3>(is)), secondary_devices,
&new_instance);
RETURN_IF_ERROR(ResetNumaMemoryPolicy());
RETURN_IF_ERROR(err);
RETURN_IF_ERROR(
model->RegisterInstance(std::move(new_instance), passive));

// When deploying on GPU, we want to make sure the GPU memory usage
// is within allowed range, otherwise, stop the creation to ensure
// there is sufficient GPU memory for other use.
// We check the usage after loading the instance to better enforcing
// the limit. If we check before loading, we may create instance
// that occupies the rest of available memory which against the purpose
if (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
size_t free, total;
double memory_limit;
RETURN_IF_ERROR(GetDeviceMemoryInfo(id, &free, &total));
RETURN_IF_ERROR(BackendConfigurationModelLoadGpuFraction(
backend_cmdline_config_map, id, &memory_limit));
const size_t allow = total * memory_limit;
const size_t used = total - free;
if (used > allow) {
return Status(
Status::Code::UNAVAILABLE,
std::string("can not create model '") + instance_name +
"': memory limit set for " +
TRITONSERVER_InstanceGroupKindString(kind) + " " +
std::to_string(id) +
" has exceeded, model loading is rejected.");
}

// Note std::async can raise an exception on failure to start threads
try {
// Note that the local variables should be captured by value
creation_results.emplace_back(std::async(
launch_policy,
[host_policy, model, instance_name, signature, kind, id,
profile_names, passive, policy_name, rate_limiter_config_ptr,
secondary_devices, &backend_cmdline_config_map]() {
std::shared_ptr<TritonModelInstance> new_instance;
RETURN_IF_ERROR(SetNumaConfigOnThread(*host_policy));
// NOTE [thread-safety]: CreateInstance can modify bg_instances
// via SetBackendThread
auto err = CreateInstance(
model, instance_name, signature, kind, id, profile_names,
passive, policy_name, *host_policy,
*rate_limiter_config_ptr, secondary_devices, &new_instance);
RETURN_IF_ERROR(ResetNumaMemoryPolicy());
RETURN_IF_ERROR(err);
// NOTE [thread-safety]: RegisterInstance modifies bg/bg_passive
// instances
RETURN_IF_ERROR(
model->RegisterInstance(std::move(new_instance), passive));

// When deploying on GPU, we want to make sure the GPU memory
// usage is within the allowed range; otherwise, stop the
// creation to ensure there is sufficient GPU memory for other
// use. We check the usage after loading the instance to better
// enforce the limit. If we checked before loading, we might
// create an instance that occupies the rest of the available
// memory, which would defeat the purpose.
if (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
size_t free, total;
double memory_limit;
RETURN_IF_ERROR(GetDeviceMemoryInfo(id, &free, &total));
RETURN_IF_ERROR(BackendConfigurationModelLoadGpuFraction(
backend_cmdline_config_map, id, &memory_limit));
const size_t allow = total * memory_limit;
const size_t used = total - free;
if (used > allow) {
return Status(
Status::Code::UNAVAILABLE,
std::string("can not create model '") + instance_name +
"': memory limit set for " +
TRITONSERVER_InstanceGroupKindString(kind) + " " +
std::to_string(id) +
" has exceeded, model loading is rejected.");
}
}

return Status::Success;
}));
}
catch (const std::exception& ex) {
return Status(
Status::Code::INTERNAL,
"ERROR: Failed to create instance: " + std::string(ex.what()));
}
}
}
}

return Status::Success;
auto res = Status::Success;
for (auto& cr : creation_results) {
auto lres = cr.get();
if (!lres.IsOk()) {
LOG_ERROR << "ERROR: Failed to create instance: " << lres.Message();
res = lres;
}
}
return res;
}
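As a worked example of the GPU memory check inside the creation task above (hypothetical numbers): on a 16 GiB device with a model-load GPU fraction of 0.8, allow = 16 * 0.8 = 12.8 GiB. If only 2 GiB remain free after the instance loads, then used = 16 - 2 = 14 GiB, which exceeds the 12.8 GiB allowance, so the task returns UNAVAILABLE and the model load is rejected.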

Status
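The launch-policy logic in SetInstances relies on a standard property of std::async: a task launched with std::launch::deferred does not run until get() is called on its future, and it then runs on the calling thread, so draining the futures in order is effectively serial execution, while a task launched with std::launch::async starts on its own thread immediately and the work can overlap. A small standalone sketch, separate from the Triton code, that demonstrates the difference:

#include <chrono>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

int main()
{
  // Flip this to std::launch::deferred to see the tasks run one at a time
  // on the main thread as each future is drained.
  const auto launch_policy = std::launch::async;

  std::vector<std::future<int>> results;
  for (int i = 0; i < 4; ++i) {
    results.emplace_back(std::async(launch_policy, [i]() {
      // Stand-in for per-instance initialization work.
      std::this_thread::sleep_for(std::chrono::milliseconds(200));
      return i;
    }));
  }

  // Drain the futures serially, mirroring how SetInstances collects its
  // creation_results; with std::launch::async the sleeps overlap, with
  // std::launch::deferred they run back to back.
  for (auto& r : results) {
    std::cout << "initialized instance " << r.get() << "\n";
  }
  return 0;
}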
5 changes: 5 additions & 0 deletions src/tritonserver_stub.cc
@@ -1015,6 +1015,11 @@ TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup()
{
}

TRITONAPI_DECLSPEC void
TRITONBACKEND_BackendAttributeSetParallelInstanceLoading()
{
}

TRITONAPI_DECLSPEC void
TRITONCACHE_ApiVersion()
{