Commit 5f99209

feat(llm): add llm_params option to llm_call
Extend llm_call to accept an optional llm_params dictionary for passing configuration parameters (e.g., temperature, max_tokens) to the language model. This enables more flexible control over LLM behavior during calls.

refactor(llm): replace llm_params context manager with argument
Update all usages of the llm_params context manager to pass llm_params as an argument to llm_call instead. This simplifies parameter handling and improves code clarity for LLM calls.

docs: clarify prompt customization and llm_params usage

- update LLMChain config usage
- add unit and e2e tests
- fix failing tests
1 parent 4c34032 commit 5f99209
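
In practice, the change replaces the llm_params context manager with a keyword argument on llm_call. A minimal sketch of the new calling convention (the wrapper function and its arguments are illustrative; only the import path follows the utils module touched by this commit):

```python
# Minimal sketch of the calling convention introduced by this commit.
# `check_input` and its arguments are illustrative, not part of the commit.
from nemoguardrails.actions.llm.utils import llm_call


async def check_input(llm, prompt: str) -> str:
    # Before: parameters were applied via the llm_params context manager:
    #     with llm_params(llm, temperature=0.0):
    #         check = await llm_call(llm, prompt)
    # After: parameters are passed as an optional dictionary argument.
    check = await llm_call(llm, prompt, llm_params={"temperature": 0.0})
    return check
```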

File tree: 21 files changed, +777 −163 lines

docs/user-guides/advanced/prompt-customization.md

Lines changed: 4 additions & 4 deletions
@@ -55,6 +55,7 @@ To override the prompt for any other custom purpose, you can specify the `mode`
 As an example of this, let's consider the case of compacting. Some applications might need concise prompts, for instance to avoid handling long contexts, and lower latency at the risk of slightly degraded performance due to the smaller context. For this, you might want to have multiple versions of a prompt for the same task and same model. This can be achieved as follows:

 Task configuration:
+
 ```yaml
 models:
   - type: main
@@ -65,6 +66,7 @@ prompting_mode: "compact" # Default value is "standard"
 ```

 Prompts configuration:
+
 ```yaml
 prompts:
   - task: generate_user_intent
@@ -117,6 +119,7 @@ prompts:
     content: ...
   # ...
 ```
+
 For each task, you can also specify the maximum length of the prompt to be used for the LLM call in terms of the number of characters. This is useful if you want to limit the number of tokens used by the LLM or when you want to make sure that the prompt length does not exceed the maximum context length. When the maximum length is exceeded, the prompt is truncated by removing older turns from the conversation history until length of the prompt is less than or equal to the maximum length. The default maximum length is 16000 characters.

 For example, for the `generate_user_intent` task, you can specify the following:
@@ -129,7 +132,6 @@ prompts:
     max_length: 3000
 ```

-
 ### Content Template

 The content for a completion prompt or the body for a message in a chat prompt is a string that can also include variables and potentially other types of constructs. NeMo Guardrails uses [Jinja2](https://jinja.palletsprojects.com/) as the templating engine. Check out the [Jinja Synopsis](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for a quick introduction.
@@ -200,7 +202,6 @@ Optionally, the output from the LLM can be parsed using an *output parser*. The
 - `bot_message`: parse the bot message, i.e., removes the "Bot message:" prefix if present;
 - `verbose_v1`: parse the output of the `verbose_v1` filter.

-
 ## Predefined Prompts

 Currently, the NeMo Guardrails toolkit includes prompts for `openai/gpt-3.5-turbo-instruct`, `openai/gpt-3.5-turbo`, `openai/gpt-4`, `databricks/dolly-v2-3b`, `cohere/command`, `cohere/command-light`, `cohere/command-light-nightly`.
@@ -232,8 +233,7 @@ prompt = llm_task_manager.render_task_prompt(
     },
 )

-with llm_params(llm, temperature=0.0):
-    check = await llm_call(llm, prompt)
+check = await llm_call(llm, prompt, llm_params={"temperature": 0.0})
 ...
 ```
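
A hedged sketch of how the documented snippet might sit inside a custom action; the @action name, the task string, and the injected parameters are illustrative assumptions, not part of this commit:

```python
# Illustrative only: "custom_check" and the context variables are placeholders;
# the guide's actual task name is elided in the hunk above.
from nemoguardrails.actions import action
from nemoguardrails.actions.llm.utils import llm_call


@action(name="custom_check")
async def custom_check(llm_task_manager, llm, context: dict) -> str:
    prompt = llm_task_manager.render_task_prompt(
        task="custom_check",  # assumed custom task defined under `prompts:`
        context={"user_input": context.get("user_message")},
    )
    # New style introduced by this commit: sampling parameters go directly
    # into llm_call instead of the llm_params context manager.
    return await llm_call(llm, prompt, llm_params={"temperature": 0.0})
```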

nemoguardrails/actions/llm/generation.py

Lines changed: 70 additions & 74 deletions
@@ -436,8 +436,9 @@ async def generate_user_intent(
         llm_call_info_var.set(LLMCallInfo(task=Task.GENERATE_USER_INTENT.value))

         # We make this call with temperature 0 to have it as deterministic as possible.
-        with llm_params(llm, temperature=self.config.lowest_temperature):
-            result = await llm_call(llm, prompt)
+        result = await llm_call(
+            llm, prompt, llm_params={"temperature": self.config.lowest_temperature}
+        )

         # Parse the output using the associated parser
         result = self.llm_task_manager.parse_task_output(
@@ -518,17 +519,15 @@ async def generate_user_intent(
             llm_call_info_var.set(LLMCallInfo(task=Task.GENERAL.value))

             generation_options: GenerationOptions = generation_options_var.get()
-            with llm_params(
+            llm_params = (
+                generation_options and generation_options.llm_params
+            ) or {}
+            text = await llm_call(
                 llm,
-                **(
-                    (generation_options and generation_options.llm_params) or {}
-                ),
-            ):
-                text = await llm_call(
-                    llm,
-                    prompt,
-                    custom_callback_handlers=[streaming_handler_var.get()],
-                )
+                prompt,
+                custom_callback_handlers=[streaming_handler_var.get()],
+                llm_params=llm_params,
+            )
             text = self.llm_task_manager.parse_task_output(
                 Task.GENERAL, output=text
             )
@@ -558,16 +557,16 @@ async def generate_user_intent(
             )

             generation_options: GenerationOptions = generation_options_var.get()
-            with llm_params(
+            llm_params = (
+                generation_options and generation_options.llm_params
+            ) or {}
+            result = await llm_call(
                 llm,
-                **((generation_options and generation_options.llm_params) or {}),
-            ):
-                result = await llm_call(
-                    llm,
-                    prompt,
-                    custom_callback_handlers=[streaming_handler_var.get()],
-                    stop=["User:"],
-                )
+                prompt,
+                custom_callback_handlers=[streaming_handler_var.get()],
+                stop=["User:"],
+                llm_params=llm_params,
+            )

             text = self.llm_task_manager.parse_task_output(
                 Task.GENERAL, output=result
@@ -662,8 +661,9 @@ async def generate_next_step(
         llm_call_info_var.set(LLMCallInfo(task=Task.GENERATE_NEXT_STEPS.value))

         # We use temperature 0 for next step prediction as well
-        with llm_params(llm, temperature=self.config.lowest_temperature):
-            result = await llm_call(llm, prompt)
+        result = await llm_call(
+            llm, prompt, llm_params={"temperature": self.config.lowest_temperature}
+        )

         # Parse the output using the associated parser
         result = self.llm_task_manager.parse_task_output(
@@ -924,23 +924,23 @@ async def generate_bot_message(
                 prompt = context.get("user_message")

                 generation_options: GenerationOptions = generation_options_var.get()
-                with llm_params(
+                llm_params = (
+                    generation_options and generation_options.llm_params
+                ) or {}
+                result = await llm_call(
                     llm,
-                    **(
-                        (generation_options and generation_options.llm_params) or {}
-                    ),
-                ):
-                    result = await llm_call(
-                        llm, prompt, custom_callback_handlers=[streaming_handler]
-                    )
+                    prompt,
+                    custom_callback_handlers=[streaming_handler],
+                    llm_params=llm_params,
+                )

-                    result = self.llm_task_manager.parse_task_output(
-                        Task.GENERAL, output=result
-                    )
+                result = self.llm_task_manager.parse_task_output(
+                    Task.GENERAL, output=result
+                )

-                    result = _process_parsed_output(
-                        result, self._include_reasoning_traces()
-                    )
+                result = _process_parsed_output(
+                    result, self._include_reasoning_traces()
+                )

                 log.info(
                     "--- :: LLM Bot Message Generation passthrough call took %.2f seconds",
@@ -987,13 +987,15 @@ async def generate_bot_message(
             llm_call_info_var.set(LLMCallInfo(task=Task.GENERATE_BOT_MESSAGE.value))

             generation_options: GenerationOptions = generation_options_var.get()
-            with llm_params(
+            llm_params = (
+                generation_options and generation_options.llm_params
+            ) or {}
+            result = await llm_call(
                 llm,
-                **((generation_options and generation_options.llm_params) or {}),
-            ):
-                result = await llm_call(
-                    llm, prompt, custom_callback_handlers=[streaming_handler]
-                )
+                prompt,
+                custom_callback_handlers=[streaming_handler],
+                llm_params=llm_params,
+            )

             log.info(
                 "--- :: LLM Bot Message Generation call took %.2f seconds",
@@ -1094,8 +1096,9 @@ async def generate_value(
         # Initialize the LLMCallInfo object
         llm_call_info_var.set(LLMCallInfo(task=Task.GENERATE_VALUE.value))

-        with llm_params(llm, temperature=self.config.lowest_temperature):
-            result = await llm_call(llm, prompt)
+        result = await llm_call(
+            llm, prompt, llm_params={"temperature": self.config.lowest_temperature}
+        )

         # Parse the output using the associated parser
         result = self.llm_task_manager.parse_task_output(
@@ -1269,32 +1272,28 @@ async def generate_intent_steps_message(
                 # We buffer the content, so we can get a chance to look at the
                 # first k lines.
                 await _streaming_handler.enable_buffering()
-                with llm_params(llm, temperature=self.config.lowest_temperature):
-                    asyncio.create_task(
-                        llm_call(
-                            llm,
-                            prompt,
-                            custom_callback_handlers=[_streaming_handler],
-                            stop=["\nuser ", "\nUser "],
-                        )
+                asyncio.create_task(
+                    llm_call(
+                        llm,
+                        prompt,
+                        custom_callback_handlers=[_streaming_handler],
+                        stop=["\nuser ", "\nUser "],
+                        llm_params={"temperature": self.config.lowest_temperature},
                     )
-                    result = await _streaming_handler.wait_top_k_nonempty_lines(k=2)
+                )
+                result = await _streaming_handler.wait_top_k_nonempty_lines(k=2)

-                    # We also mark that the message is still being generated
-                    # by a streaming handler.
-                    result += (
-                        f'\nBot message: "<<STREAMING[{_streaming_handler.uid}]>>"'
-                    )
+                # We also mark that the message is still being generated
+                # by a streaming handler.
+                result += f'\nBot message: "<<STREAMING[{_streaming_handler.uid}]>>"'

-                    # Moving forward we need to set the expected pattern to correctly
-                    # parse the message.
-                    # TODO: Figure out a more generic way to deal with this.
-                    if prompt_config.output_parser == "verbose_v1":
-                        _streaming_handler.set_pattern(
-                            prefix='Bot message: "', suffix='"'
-                        )
-                    else:
-                        _streaming_handler.set_pattern(prefix=' "', suffix='"')
+                # Moving forward we need to set the expected pattern to correctly
+                # parse the message.
+                # TODO: Figure out a more generic way to deal with this.
+                if prompt_config.output_parser == "verbose_v1":
+                    _streaming_handler.set_pattern(prefix='Bot message: "', suffix='"')
+                else:
+                    _streaming_handler.set_pattern(prefix=' "', suffix='"')
             else:
                 # Initialize the LLMCallInfo object
                 llm_call_info_var.set(
@@ -1306,8 +1305,7 @@ async def generate_intent_steps_message(
                     **((generation_options and generation_options.llm_params) or {}),
                     "temperature": self.config.lowest_temperature,
                 }
-                with llm_params(llm, **additional_params):
-                    result = await llm_call(llm, prompt)
+                result = await llm_call(llm, prompt, llm_params=additional_params)

                 # Parse the output using the associated parser
                 result = self.llm_task_manager.parse_task_output(
@@ -1388,10 +1386,8 @@ async def generate_intent_steps_message(

         # We make this call with temperature 0 to have it as deterministic as possible.
         generation_options: GenerationOptions = generation_options_var.get()
-        with llm_params(
-            llm, **((generation_options and generation_options.llm_params) or {})
-        ):
-            result = await llm_call(llm, prompt)
+        llm_params = (generation_options and generation_options.llm_params) or {}
+        result = await llm_call(llm, prompt, llm_params=llm_params)

         result = self.llm_task_manager.parse_task_output(
             Task.GENERAL, output=result

nemoguardrails/actions/llm/utils.py

Lines changed: 18 additions & 1 deletion
@@ -75,11 +75,28 @@ async def llm_call(
     model_provider: Optional[str] = None,
     stop: Optional[List[str]] = None,
     custom_callback_handlers: Optional[List[AsyncCallbackHandler]] = None,
+    llm_params: Optional[dict] = None,
 ) -> str:
-    """Calls the LLM with a prompt and returns the generated text."""
+    """Calls the LLM with a prompt and returns the generated text.
+
+    Args:
+        llm: The language model instance to use
+        prompt: The prompt string or list of messages
+        model_name: Optional model name for tracking
+        model_provider: Optional model provider for tracking
+        stop: Optional list of stop tokens
+        custom_callback_handlers: Optional list of callback handlers
+        llm_params: Optional configuration dictionary to pass to the LLM (e.g., temperature, max_tokens)
+
+    Returns:
+        The generated text response
+    """
     _setup_llm_call_info(llm, model_name, model_provider)
     all_callbacks = _prepare_callbacks(custom_callback_handlers)

+    if llm_params and llm is not None:
+        llm = llm.bind(**llm_params)
+
     if isinstance(prompt, str):
         response = await _invoke_with_string_prompt(llm, prompt, all_callbacks, stop)
     else:
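
The new branch wraps the model with LangChain's Runnable.bind, so the extra kwargs are applied on every subsequent invocation without mutating the caller's llm object. A hedged end-to-end sketch (the ChatOpenAI model name and parameter values are placeholders, not part of the commit):

```python
import asyncio

from langchain_openai import ChatOpenAI  # placeholder provider for illustration

from nemoguardrails.actions.llm.utils import llm_call


async def main() -> None:
    llm = ChatOpenAI(model="gpt-4o-mini")  # assumed model name
    # llm_call binds these kwargs via llm.bind(**llm_params) before invoking,
    # matching the branch added in the hunk above.
    text = await llm_call(
        llm, "Say hello.", llm_params={"temperature": 0.0, "max_tokens": 32}
    )
    print(text)


asyncio.run(main())
```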
