From a599f69cdbc58317cab3de6b4a1ef4448caabbfc Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 11:35:00 +0200 Subject: [PATCH 01/77] Add Modal orchestrator with step operator and orchestrator flavors --- src/zenml/integrations/modal/__init__.py | 14 +- .../integrations/modal/flavors/__init__.py | 8 + .../flavors/modal_orchestrator_flavor.py | 157 ++++ .../modal/orchestrators/__init__.py | 20 + .../modal/orchestrators/modal_orchestrator.py | 689 ++++++++++++++++++ 5 files changed, 883 insertions(+), 5 deletions(-) create mode 100644 src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py create mode 100644 src/zenml/integrations/modal/orchestrators/__init__.py create mode 100644 src/zenml/integrations/modal/orchestrators/modal_orchestrator.py diff --git a/src/zenml/integrations/modal/__init__.py b/src/zenml/integrations/modal/__init__.py index 081628cb035..8feee188a15 100644 --- a/src/zenml/integrations/modal/__init__.py +++ b/src/zenml/integrations/modal/__init__.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -"""Modal integration for cloud-native step execution. +"""Modal integration for cloud-native step execution and orchestration. -The Modal integration sub-module provides a step operator flavor that allows -executing steps on Modal's cloud infrastructure. +The Modal integration sub-module provides a step operator flavor and an orchestrator +flavor that allow executing steps and complete pipelines on Modal's cloud infrastructure. """ from typing import List, Type @@ -22,6 +22,7 @@ from zenml.integrations.integration import Integration from zenml.stack import Flavor +MODAL_ORCHESTRATOR_FLAVOR = "modal" MODAL_STEP_OPERATOR_FLAVOR = "modal" @@ -38,8 +39,11 @@ def flavors(cls) -> List[Type[Flavor]]: Returns: List of new stack component flavors. 
""" - from zenml.integrations.modal.flavors import ModalStepOperatorFlavor + from zenml.integrations.modal.flavors import ( + ModalOrchestratorFlavor, + ModalStepOperatorFlavor, + ) - return [ModalStepOperatorFlavor] + return [ModalOrchestratorFlavor, ModalStepOperatorFlavor] diff --git a/src/zenml/integrations/modal/flavors/__init__.py b/src/zenml/integrations/modal/flavors/__init__.py index 472e0bcb4f8..aee910296e1 100644 --- a/src/zenml/integrations/modal/flavors/__init__.py +++ b/src/zenml/integrations/modal/flavors/__init__.py @@ -13,6 +13,11 @@ # permissions and limitations under the License. """Modal integration flavors.""" +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalOrchestratorConfig, + ModalOrchestratorFlavor, + ModalOrchestratorSettings, +) from zenml.integrations.modal.flavors.modal_step_operator_flavor import ( ModalStepOperatorConfig, ModalStepOperatorFlavor, @@ -20,6 +25,9 @@ ) __all__ = [ + "ModalOrchestratorConfig", + "ModalOrchestratorFlavor", + "ModalOrchestratorSettings", "ModalStepOperatorConfig", "ModalStepOperatorFlavor", "ModalStepOperatorSettings", diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py new file mode 100644 index 00000000000..87e88cd8551 --- /dev/null +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -0,0 +1,157 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
MODAL_ORCHESTRATOR_FLAVOR = "modal"


class ModalOrchestratorSettings(BaseSettings):
    """Modal orchestrator settings.

    Attributes:
        gpu: The type of GPU to use for the pipeline execution (e.g. "A100").
        region: The region to use for the pipeline execution.
        cloud: The cloud provider to use for the pipeline execution.
        environment: The Modal environment to use for the pipeline execution.
        cpu_count: Number of CPU cores to allocate.
        memory_mb: Memory in MB to allocate.
        timeout: Maximum execution time in seconds (default 24h, Modal's max).
        min_containers: Minimum containers to keep warm (replaces keep_warm).
        max_containers: Maximum concurrent containers (replaces concurrency_limit).
        execution_mode: "pipeline" (default: whole pipeline in one function)
            or "per_step" (one Modal call per step, granular control).
        synchronous: Wait for completion (True) or fire-and-forget (False).
    """

    gpu: Optional[str] = None
    region: Optional[str] = None
    cloud: Optional[str] = None
    environment: Optional[str] = None
    # Generous defaults: Modal bills per second of actual use, so large
    # containers mainly trade cost for speed while a run is active.
    cpu_count: Optional[int] = 32
    memory_mb: Optional[int] = 65536  # 64 GB
    timeout: int = 86400  # 24 hours (Modal's maximum)
    min_containers: Optional[int] = 1  # keep one container warm between runs
    max_containers: Optional[int] = 10
    execution_mode: str = "pipeline"
    synchronous: bool = True


class ModalOrchestratorConfig(
    BaseOrchestratorConfig, ModalOrchestratorSettings
):
    """Modal orchestrator config.

    Attributes:
        token: Modal API token for authentication. If not provided,
            falls back to Modal's default authentication (~/.modal.toml).
        workspace: Modal workspace name (optional).
        environment: Modal environment name (optional).
    """

    token: Optional[SecretStr] = None
    workspace: Optional[str] = None
    # NOTE: `environment` is also declared on ModalOrchestratorSettings;
    # redeclared here so it is part of the persisted config schema.
    environment: Optional[str] = None

    @property
    def is_remote(self) -> bool:
        """Checks if this stack component is running remotely.

        Returns:
            True since Modal runs remotely.
        """
        return True

    @property
    def is_synchronous(self) -> bool:
        """Whether the orchestrator waits for the pipeline run to finish.

        Returns:
            The configured `synchronous` flag. Previously this was
            hard-coded to True, which contradicted the documented
            fire-and-forget mode (`synchronous=False`).
        """
        return self.synchronous


class ModalOrchestratorFlavor(BaseOrchestratorFlavor):
    """Flavor for the Modal orchestrator."""

    @property
    def name(self) -> str:
        """Name of the orchestrator flavor.

        Returns:
            Name of the orchestrator flavor.
        """
        return MODAL_ORCHESTRATOR_FLAVOR

    @property
    def docs_url(self) -> Optional[str]:
        """A url to point at docs explaining this flavor.

        Returns:
            A flavor docs url.
        """
        return self.generate_default_docs_url()

    @property
    def sdk_docs_url(self) -> Optional[str]:
        """A url to point at SDK docs explaining this flavor.

        Returns:
            A flavor SDK docs url.
        """
        return self.generate_default_sdk_docs_url()

    @property
    def logo_url(self) -> str:
        """A url to represent the flavor in the dashboard.

        Returns:
            The flavor logo.
        """
        return "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/orchestrator/modal.png"

    @property
    def config_class(self) -> Type[ModalOrchestratorConfig]:
        """Config class for the Modal orchestrator flavor.

        Returns:
            The config class.
        """
        return ModalOrchestratorConfig

    @property
    def implementation_class(self) -> Type["ModalOrchestrator"]:
        """Implementation class for this flavor.

        Returns:
            Implementation class for this flavor.
        """
        # Imported lazily to avoid a circular import with the orchestrator.
        from zenml.integrations.modal.orchestrators import ModalOrchestrator

        return ModalOrchestrator
+"""Modal orchestrator implementation.""" + +from zenml.integrations.modal.orchestrators.modal_orchestrator import ( + ModalOrchestrator, +) + +__all__ = ["ModalOrchestrator"] \ No newline at end of file diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py new file mode 100644 index 00000000000..a7dda222580 --- /dev/null +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -0,0 +1,689 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. 
+"""Implementation of a Modal orchestrator.""" + +import os +import time +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast +from uuid import uuid4 + +try: + import modal +except ImportError: + modal = None # type: ignore + +from zenml.config.base_settings import BaseSettings +from zenml.config.build_configuration import BuildConfiguration +from zenml.config.resource_settings import ByteUnit, ResourceSettings +from zenml.enums import StackComponentType +from zenml.logger import get_logger +from zenml.orchestrators import ContainerizedOrchestrator +from zenml.stack import Stack, StackValidator +from zenml.utils import string_utils + +if TYPE_CHECKING: + from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalOrchestratorConfig, + ModalOrchestratorSettings, + ) + from zenml.models import PipelineDeploymentResponse, PipelineRunResponse + from zenml.models.v2.core.pipeline_deployment import PipelineDeploymentBase + +logger = get_logger(__name__) + +ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID = "ZENML_MODAL_ORCHESTRATOR_RUN_ID" + + +def run_step_in_modal( + step_name: str, + deployment_id: str, + orchestrator_run_id: str, +) -> None: + """Execute a single ZenML step in Modal. + + Args: + step_name: Name of the step to execute. + deployment_id: ID of the pipeline deployment. + orchestrator_run_id: ID of the orchestrator run. + + Raises: + Exception: If step execution fails. 
+ """ + import os + import sys + + print(f"🚀 Running step '{step_name}' in Modal") + sys.stdout.flush() + + # Set the orchestrator run ID in the Modal environment + os.environ["ZENML_MODAL_ORCHESTRATOR_RUN_ID"] = orchestrator_run_id + + try: + from zenml.entrypoints.step_entrypoint_configuration import ( + StepEntrypointConfiguration, + ) + + print( + f"🔧 Executing step '{step_name}' directly in process for maximum speed" + ) + sys.stdout.flush() + + # Create the entrypoint arguments + args = StepEntrypointConfiguration.get_entrypoint_arguments( + step_name=step_name, deployment_id=deployment_id + ) + + # Create the configuration and run the step + config = StepEntrypointConfiguration(arguments=args) + config.run() + + print(f"✅ Step {step_name} completed successfully") + sys.stdout.flush() + + except Exception as e: + import traceback + + error_details = traceback.format_exc() + print(f"💥 Error executing step {step_name}: {e}") + print(f"📝 Full traceback:\n{error_details}") + sys.stdout.flush() + raise + + +def run_entire_pipeline( + deployment_id: str, + orchestrator_run_id: str, +) -> None: + """Execute entire pipeline using PipelineEntrypointConfiguration for maximum efficiency. + + Args: + deployment_id: ID of the pipeline deployment. + orchestrator_run_id: ID of the orchestrator run. + + Raises: + Exception: If pipeline execution fails. 
+ """ + import os + import time + + print( + "🚀 [MODAL] Starting ENTIRE PIPELINE using PipelineEntrypointConfiguration!", + flush=True, + ) + print(f"📝 [MODAL] Deployment ID: {deployment_id}", flush=True) + print(f"🆔 [MODAL] Orchestrator Run ID: {orchestrator_run_id}", flush=True) + print(f"⏰ [MODAL] Start time: {time.strftime('%H:%M:%S')}", flush=True) + + # Set the orchestrator run ID in the Modal environment + os.environ["ZENML_MODAL_ORCHESTRATOR_RUN_ID"] = orchestrator_run_id + + try: + from zenml.entrypoints.pipeline_entrypoint_configuration import ( + PipelineEntrypointConfiguration, + ) + + print( + "🔧 [MODAL] Initializing pipeline entrypoint configuration...", + flush=True, + ) + + # Create the entrypoint arguments + args = PipelineEntrypointConfiguration.get_entrypoint_arguments( + deployment_id=deployment_id + ) + + print("⚙️ [MODAL] Creating pipeline configuration...", flush=True) + config = PipelineEntrypointConfiguration(arguments=args) + + print("🏃 [MODAL] Executing entire pipeline...", flush=True) + config.run() + + print("🎉 [MODAL] ENTIRE PIPELINE COMPLETED SUCCESSFULLY!", flush=True) + + except Exception as e: + import traceback + + error_details = traceback.format_exc() + print(f"💥 [MODAL] Error executing pipeline: {e}", flush=True) + print(f"📝 [MODAL] Full traceback:\n{error_details}", flush=True) + raise + + +def get_gpu_values( + settings: "ModalOrchestratorSettings", resource_settings: ResourceSettings +) -> Optional[str]: + """Get the GPU values for the Modal orchestrator. + + Args: + settings: The Modal orchestrator settings. + resource_settings: The resource settings. + + Returns: + The GPU string if a count is specified, otherwise the GPU type. 
+ """ + if not settings.gpu: + return None + # Prefer resource_settings gpu_count, fallback to 1 + gpu_count = resource_settings.gpu_count or 1 + return f"{settings.gpu}:{gpu_count}" if gpu_count > 1 else settings.gpu + + +def get_resource_values( + config: "ModalOrchestratorConfig", resource_settings: ResourceSettings +) -> Tuple[Optional[int], Optional[int]]: + """Get CPU and memory values with config fallbacks. + + Args: + config: The Modal orchestrator config. + resource_settings: The resource settings. + + Returns: + Tuple of (cpu_count, memory_mb) with config fallbacks. + """ + # Prefer pipeline resource settings, fallback to config defaults + cpu_count_raw = resource_settings.cpu_count or config.cpu_count + cpu_count: Optional[int] = None + if cpu_count_raw is not None: + cpu_count = int(cpu_count_raw) + + # Convert memory to MB if needed + memory_mb: Optional[int] = config.memory_mb + if resource_settings.memory: + memory_value = resource_settings.get_memory(ByteUnit.MB) + if memory_value is not None: + memory_mb = int(memory_value) + + return cpu_count, memory_mb + + +def get_or_deploy_persistent_modal_app( + pipeline_name: str, + zenml_image: Any, + gpu_values: Optional[str], + cpu_count: Optional[int], + memory_mb: Optional[int], + cloud: Optional[str], + region: Optional[str], + timeout: int, + min_containers: Optional[int], + max_containers: Optional[int], + environment_name: Optional[str] = None, + execution_mode: str = "single_function", +) -> Any: + """Get or deploy a persistent Modal app with warm containers. + + This function deploys a Modal app that stays alive with warm containers + for maximum speed between pipeline runs. + + Args: + pipeline_name: Name of the pipeline. + zenml_image: Pre-built ZenML Docker image for Modal. + gpu_values: GPU configuration string. + cpu_count: Number of CPU cores. + memory_mb: Memory allocation in MB. + cloud: Cloud provider to use. + region: Region to deploy in. + timeout: Maximum execution timeout. 
def get_or_deploy_persistent_modal_app(
    pipeline_name: str,
    zenml_image: Any,
    gpu_values: Optional[str],
    cpu_count: Optional[int],
    memory_mb: Optional[int],
    cloud: Optional[str],
    region: Optional[str],
    timeout: int,
    min_containers: Optional[int],
    max_containers: Optional[int],
    environment_name: Optional[str] = None,
    execution_mode: str = "pipeline",
) -> Any:
    """Get or deploy a persistent Modal app with warm containers.

    This function deploys a Modal app that stays alive with warm containers
    for maximum speed between pipeline runs. Apps are keyed by pipeline name,
    execution mode and a 2-hour time window so they are redeployed every two
    hours to refresh tokens.

    Args:
        pipeline_name: Name of the pipeline.
        zenml_image: Pre-built ZenML Docker image for Modal.
        gpu_values: GPU configuration string.
        cpu_count: Number of CPU cores.
        memory_mb: Memory allocation in MB.
        cloud: Cloud provider to use.
        region: Region to deploy in.
        timeout: Maximum execution timeout.
        min_containers: Minimum containers to keep warm.
        max_containers: Maximum containers to scale to.
        environment_name: Modal environment name.
        execution_mode: "per_step" for one Modal call per step; any other
            value (default "pipeline") runs the whole pipeline in one
            function. (Fix: the previous default "single_function" was a
            removed mode; it already fell through to the pipeline branch,
            so this change is behavior-compatible.)

    Returns:
        The Modal function ready for execution.

    Raises:
        Exception: If deployment fails.
    """
    # Use pipeline name + execution mode + 2-hour window for app reuse.
    # This ensures apps get redeployed every 2 hours to refresh tokens.
    mode_suffix = execution_mode.replace("_", "-")

    # 2-hour timestamp window (rounds down to the nearest 2-hour boundary).
    current_time = int(time.time())
    two_hour_window = current_time // (2 * 3600)  # 2 hours = 7200 seconds

    app_name = f"zenml-{pipeline_name.replace('_', '-')}-{mode_suffix}-{two_hour_window}"

    logger.info(f"🏗️ Getting/deploying persistent Modal app: {app_name}")

    # Create the app
    app = modal.App(app_name)

    # Ensure we have minimum containers for fast startup
    effective_min_containers = min_containers or 1
    effective_max_containers = max_containers or 10

    # Choose the remote entrypoint based on execution mode.
    if execution_mode == "per_step":
        logger.info("🔧 Creating per-step mode for granular execution")
        execution_func: Any = run_step_in_modal
        function_name = "run_step_in_modal"
    else:
        logger.info("🚀 Creating pipeline mode for MAXIMUM SPEED!")
        execution_func = run_entire_pipeline
        function_name = "run_entire_pipeline"

    execute_step_func = app.function(
        image=zenml_image,
        gpu=gpu_values,
        cpu=cpu_count,
        memory=memory_mb,
        cloud=cloud,
        region=region,
        timeout=timeout,
        min_containers=effective_min_containers,  # Keep containers warm for speed
        max_containers=effective_max_containers,  # Allow scaling
    )(execution_func)

    # Try to look up an existing app in the current 2-hour window,
    # deploy a fresh one if not found.
    try:
        logger.info(
            f"🔍 Checking for Modal app in current 2-hour window: {app_name}"
        )

        try:
            modal.App.lookup(
                app_name,
                environment_name=environment_name or "main",
            )
            logger.info(
                f"♻️ Found existing app '{app_name}' with fresh tokens - reusing warm containers!"
            )

            # Try to get the function directly
            try:
                existing_function = modal.Function.from_name(
                    app_name,
                    function_name,
                    environment_name=environment_name or "main",
                )
                logger.info(
                    "✅ Successfully retrieved function from existing app!"
                )
                return existing_function
            except Exception as func_error:
                logger.warning(
                    f"⚠️ Function lookup failed: {func_error}, redeploying..."
                )
                # Fall through to deployment

        except Exception:
            # App not found or other lookup error - deploy fresh app
            logger.info(
                "🆕 No app found for current 2-hour window, deploying fresh app..."
            )

        # Deploy the app
        app.deploy(name=app_name, environment_name=environment_name or "main")
        logger.info(
            f"✅ App '{app_name}' deployed with fresh tokens and {effective_min_containers} warm containers"
        )
        logger.info(
            f"📱 View real-time logs at: https://modal.com/apps/{app_name}"
        )

    except Exception as e:
        logger.error(f"❌ Deployment failed: {e}")
        raise

    logger.info(
        f"🔥 Modal app configured for SPEED with min_containers={effective_min_containers}, max_containers={effective_max_containers}"
    )
    logger.info(
        f"💡 This means {effective_min_containers} containers will stay warm for faster execution!"
    )

    return execute_step_func
class ModalOrchestrator(ContainerizedOrchestrator):
    """Orchestrator responsible for running entire pipelines on Modal.

    This orchestrator runs complete pipelines in a single Modal function
    for maximum speed and efficiency, avoiding the overhead of multiple
    step executions.
    """

    @property
    def config(self) -> "ModalOrchestratorConfig":
        """Returns the Modal orchestrator config.

        Returns:
            The Modal orchestrator config.
        """
        return cast("ModalOrchestratorConfig", self._config)

    @property
    def settings_class(self) -> Optional[Type["BaseSettings"]]:
        """Settings class for the Modal orchestrator.

        Returns:
            The settings class.
        """
        # Imported lazily to avoid a circular import with the flavor module.
        from zenml.integrations.modal.flavors.modal_orchestrator_flavor import (
            ModalOrchestratorSettings,
        )

        return ModalOrchestratorSettings

    def _setup_modal_client(self) -> None:
        """Setup Modal client with authentication.

        Mutates `os.environ` so the `modal` client library picks up the
        credentials/workspace configured on this stack component.
        """
        if self.config.token:
            # Set Modal token from config
            # NOTE(review): Modal token auth normally requires both
            # MODAL_TOKEN_ID and MODAL_TOKEN_SECRET; only the ID is set
            # here — confirm this is sufficient for the token format used.
            os.environ["MODAL_TOKEN_ID"] = self.config.token.get_secret_value()
            logger.info("Using Modal token from orchestrator config")
        else:
            logger.info("Using default Modal authentication (~/.modal.toml)")

        # Set workspace/environment if provided
        if self.config.workspace:
            os.environ["MODAL_WORKSPACE"] = self.config.workspace
        if self.config.environment:
            os.environ["MODAL_ENVIRONMENT"] = self.config.environment

    @property
    def validator(self) -> Optional[StackValidator]:
        """Ensures there is a container registry and artifact store in the stack.

        Returns:
            A `StackValidator` instance.
        """

        def _validate_remote_components(stack: "Stack") -> tuple[bool, str]:
            # Modal executes remotely, so both the artifact store and the
            # container registry must be reachable from outside the client.
            if stack.artifact_store.config.is_local:
                return False, (
                    "The Modal orchestrator runs code remotely and "
                    "needs to write files into the artifact store, but the "
                    f"artifact store `{stack.artifact_store.name}` of the "
                    "active stack is local. Please ensure that your stack "
                    "contains a remote artifact store when using the Modal "
                    "orchestrator."
                )

            container_registry = stack.container_registry
            assert container_registry is not None

            if container_registry.config.is_local:
                return False, (
                    "The Modal orchestrator runs code remotely and "
                    "needs to push/pull Docker images, but the "
                    f"container registry `{container_registry.name}` of the "
                    "active stack is local. Please ensure that your stack "
                    "contains a remote container registry when using the "
                    "Modal orchestrator."
                )

            return True, ""

        return StackValidator(
            required_components={
                StackComponentType.CONTAINER_REGISTRY,
                StackComponentType.IMAGE_BUILDER,
            },
            custom_validation_function=_validate_remote_components,
        )

    def get_orchestrator_run_id(self) -> str:
        """Returns the active orchestrator run id.

        The id is written into the environment by the remote execution
        functions (`run_step_in_modal` / `run_entire_pipeline`) before the
        entrypoint runs.

        Raises:
            RuntimeError: If the environment variable specifying the run id
                is not set.

        Returns:
            The orchestrator run id.
        """
        try:
            return os.environ[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID]
        except KeyError:
            raise RuntimeError(
                "Unable to read run id from environment variable "
                f"{ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID}."
            )

    def get_docker_builds(
        self, deployment: "PipelineDeploymentBase"
    ) -> List[BuildConfiguration]:
        """Get the Docker build configurations for the Modal orchestrator.

        Args:
            deployment: The pipeline deployment.

        Returns:
            A list of Docker build configurations.
        """
        # Use the standard containerized orchestrator build logic
        # This ensures ZenML builds the image with all pipeline code
        return super().get_docker_builds(deployment)

    def _build_modal_image(
        self,
        deployment: "PipelineDeploymentResponse",
        stack: "Stack",
        environment: Dict[str, str],
    ) -> Any:
        """Build the Modal image for pipeline execution.

        Wraps the ZenML-built Docker image in a `modal.Image` with registry
        credentials and the given environment variables baked in.

        Args:
            deployment: The pipeline deployment.
            stack: The stack the pipeline will run on.
            environment: Environment variables to set.

        Returns:
            The configured Modal image.

        Raises:
            RuntimeError: If no Docker credentials are found.
            ValueError: If no container registry is found.
        """
        # Get the ZenML-built image that contains all pipeline code
        image_name = self.get_image(deployment=deployment)

        if not stack.container_registry:
            raise ValueError(
                "No Container registry found in the stack. "
                "Please add a container registry and ensure "
                "it is correctly configured."
            )

        if docker_creds := stack.container_registry.credentials:
            docker_username, docker_password = docker_creds
        else:
            raise RuntimeError(
                "No Docker credentials found for the container registry."
            )

        # Create Modal secret for registry authentication
        registry_secret = modal.Secret.from_dict(
            {
                "REGISTRY_USERNAME": docker_username,
                "REGISTRY_PASSWORD": docker_password,
            }
        )

        # Build Modal image from the ZenML-built image
        # Use from_registry to pull the ZenML image with authentication
        # and install Modal dependencies
        zenml_image = (
            modal.Image.from_registry(image_name, secret=registry_secret)
            .pip_install("modal")  # Install Modal in the container
            .env(environment)
        )

        return zenml_image

    def prepare_or_run_pipeline(
        self,
        deployment: "PipelineDeploymentResponse",
        stack: "Stack",
        environment: Dict[str, str],
        placeholder_run: Optional["PipelineRunResponse"] = None,
    ) -> Any:
        """Runs the complete pipeline in a single Modal function.

        Args:
            deployment: The pipeline deployment to prepare or run.
            stack: The stack the pipeline will run on.
            environment: Environment variables to set in the orchestration
                environment.
            placeholder_run: An optional placeholder run for the deployment (unused).

        Raises:
            RuntimeError: If Modal is not installed or if a step fails.
            Exception: If pipeline execution fails.
        """
        _ = placeholder_run  # Mark as intentionally unused
        if modal is None:
            raise RuntimeError(
                "Modal is not installed. Please install it with: pip install modal"
            )
        if deployment.schedule:
            logger.warning(
                "Modal Orchestrator currently does not support the "
                "use of schedules. The `schedule` will be ignored "
                "and the pipeline will be run immediately."
            )

        # Setup Modal authentication
        self._setup_modal_client()

        # Generate orchestrator run ID
        orchestrator_run_id = str(uuid4())
        environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id

        # Get settings from the first step (all steps use same Modal resources)
        # NOTE(review): per-step resource settings beyond the first step are
        # ignored by design — confirm this is acceptable for heterogeneous
        # pipelines.
        first_step = list(deployment.step_configurations.values())[0]
        settings = cast(
            "ModalOrchestratorSettings", self.get_settings(first_step)
        )
        resource_settings = first_step.config.resource_settings

        # Build Modal image
        zenml_image = self._build_modal_image(deployment, stack, environment)

        # Configure resources with config fallbacks
        gpu_values = get_gpu_values(settings, resource_settings)
        cpu_count, memory_mb = get_resource_values(
            self.config, resource_settings
        )

        start_time = time.time()

        # Execute steps using Modal's fast container spin-up with PERSISTENT app
        logger.info(
            "🚀 Starting pipeline execution with PERSISTENT Modal functions..."
        )

        step_names = list(deployment.step_configurations.keys())
        logger.info(f"📋 Found {len(step_names)} steps: {step_names}")

        # Get or deploy persistent Modal app with BLAZING FAST warm containers
        execute_step = get_or_deploy_persistent_modal_app(
            pipeline_name=deployment.pipeline_configuration.name,
            zenml_image=zenml_image,
            gpu_values=gpu_values,
            cpu_count=cpu_count or 8,  # Default to 8 CPU cores for speed
            memory_mb=memory_mb or 16384,  # Default to 16GB RAM for speed
            cloud=settings.cloud or self.config.cloud,
            region=settings.region or self.config.region,
            timeout=settings.timeout or self.config.timeout,
            min_containers=settings.min_containers
            or self.config.min_containers
            or 1,  # Keep 1 warm container for sequential execution
            max_containers=settings.max_containers
            or self.config.max_containers
            or 10,  # Scale to 10 containers
            environment_name=settings.environment
            or self.config.environment,  # Use environment from config/settings
            execution_mode=settings.execution_mode
            or self.config.execution_mode,  # Use execution mode from settings
        )

        logger.info(
            "⚡ Executing with DEPLOYED Modal app and warm containers..."
        )

        # Execute based on execution mode with improved Modal Function API usage
        execution_mode = settings.execution_mode or self.config.execution_mode
        sync_execution = (
            settings.synchronous
            if hasattr(settings, "synchronous")
            else self.config.synchronous
        )

        def execute_modal_function(
            func_args: Tuple[Any, ...], description: str
        ) -> Any:
            """Execute Modal function with proper sync/async control.

            Args:
                func_args: Arguments to pass to the Modal function.
                description: Description of the operation for logging.

            Returns:
                Result of the Modal function execution.
            """
            logger.info(f"🚀 {description}")

            if sync_execution:
                logger.info("⚡ Using .remote() for synchronous execution")
                # .remote() waits for completion but doesn't stream logs
                result = execute_step.remote(*func_args)
                logger.info(f"✅ {description} completed successfully!")
                return result
            else:
                logger.info(
                    "🔥 Using .spawn() for asynchronous fire-and-forget execution"
                )
                # .spawn() for fire-and-forget (async)
                function_call = execute_step.spawn(*func_args)
                logger.info(
                    f"🚀 {description} started asynchronously (not waiting for completion)"
                )
                return function_call

        if execution_mode == "per_step":
            logger.info("🔧 Using per-step mode for granular execution...")
            # Execute steps individually
            # NOTE(review): with synchronous=False this spawns all steps
            # without waiting, so step/DAG ordering is not enforced —
            # confirm this is intended for asynchronous per-step runs.
            for step_name in step_names:
                try:
                    execute_modal_function(
                        (step_name, deployment.id, orchestrator_run_id),
                        f"Step '{step_name}' execution",
                    )
                except Exception as e:
                    logger.error(f"❌ Step '{step_name}' failed: {e}")
                    logger.error("💡 Check Modal dashboard for detailed logs")
                    raise
        else:
            # Default: execute entire pipeline in one function
            try:
                execute_modal_function(
                    (deployment.id, orchestrator_run_id),
                    "Pipeline execution (MAXIMUM SPEED)",
                )
            except Exception as e:
                logger.error(f"❌ Pipeline failed: {e}")
                logger.error("💡 Check Modal dashboard for detailed logs")
                raise

        run_duration = time.time() - start_time

        # Log completion
        logger.info(
            "Pipeline run has finished in `%s`.",
            string_utils.get_human_readable_time(run_duration),
        )
+ +{% hint style="warning" %} +This component is only meant to be used within the context of a [remote ZenML deployment scenario](https://docs.zenml.io/getting-started/deploying-zenml/). Usage with a local ZenML deployment may lead to unexpected behavior! +{% endhint %} + +## When to use it + +You should use the Modal orchestrator if: + +* you want a serverless solution that scales to zero when not in use. +* you're looking for fast pipeline execution with minimal cold start overhead. +* you want cost-effective ML pipeline orchestration without managing infrastructure. +* you need easy access to GPUs and high-performance computing resources. +* you prefer a simple setup process without complex Kubernetes configurations. + +## How to deploy it + +The Modal orchestrator runs on Modal's cloud infrastructure, so you don't need to deploy or manage any servers. You just need: + +1. A [Modal account](https://modal.com/) (free tier available) +2. Modal CLI installed and authenticated +3. A [remote ZenML deployment](https://docs.zenml.io/getting-started/deploying-zenml/) for production use + +## How to use it + +To use the Modal orchestrator, we need: + +* The ZenML `modal` integration installed. If you haven't done so, run: + ```shell + zenml integration install modal + ``` +* [Docker](https://www.docker.com) installed and running. +* A [remote artifact store](../artifact-stores/README.md) as part of your stack. +* A [remote container registry](../container-registries/README.md) as part of your stack. 
+* Modal CLI installed and authenticated:
+  ```shell
+  pip install modal
+  modal setup
+  ```
+
+### Setting up the orchestrator
+
+You can register the orchestrator with or without explicit Modal credentials:
+
+**Option 1: Using Modal CLI authentication (recommended for development)**
+
+```shell
+# Register the orchestrator (uses Modal CLI credentials)
+zenml orchestrator register <ORCHESTRATOR_NAME> \
+    --flavor=modal \
+    --synchronous=true
+
+# Register and activate a stack with the new orchestrator
+zenml stack register <STACK_NAME> -o <ORCHESTRATOR_NAME> ... --set
+```
+
+**Option 2: Using Modal API token (recommended for production)**
+
+```shell
+# Register the orchestrator with explicit credentials
+zenml orchestrator register <ORCHESTRATOR_NAME> \
+    --flavor=modal \
+    --token=<MODAL_TOKEN> \
+    --workspace=<MODAL_WORKSPACE> \
+    --synchronous=true
+
+# Register and activate a stack with the new orchestrator
+zenml stack register <STACK_NAME> -o <ORCHESTRATOR_NAME> ... --set
+```
+
+You can get your Modal token from the [Modal dashboard](https://modal.com/settings/tokens).
+
+{% hint style="info" %}
+ZenML will build a Docker image called `<CONTAINER_REGISTRY_URI>/zenml:<PIPELINE_NAME>` which includes your code and use it to run your pipeline steps in Modal functions. Check out [this page](https://docs.zenml.io/how-to/customize-docker-builds/) if you want to learn more about how ZenML builds these images and how you can customize them.
+{% endhint %}
+
+You can now run any ZenML pipeline using the Modal orchestrator:
+
+```shell
+python file_that_runs_a_zenml_pipeline.py
+```
+
+### Modal UI
+
+Modal provides an excellent web interface where you can monitor your pipeline runs in real-time, view logs, and track resource usage.
+
+You can access the Modal dashboard at [modal.com/apps](https://modal.com/apps) to see your running and completed functions.
+ +### Additional configuration + +For additional configuration of the Modal orchestrator, you can pass `ModalOrchestratorSettings` which allows you to configure resource requirements, execution modes, and cloud preferences: + +```python +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalOrchestratorSettings +) + +modal_settings = ModalOrchestratorSettings( + cpu_count=16, # Number of CPU cores + memory_mb=32768, # 32GB RAM + gpu="A100", # GPU type (optional) + region="us-east-1", # Preferred region + cloud="aws", # Cloud provider + execution_mode="pipeline", # or "per_step" + timeout=3600, # 1 hour timeout + min_containers=1, # Keep warm containers + max_containers=10, # Scale up to 10 containers +) + +@pipeline( + settings={ + "orchestrator": modal_settings + } +) +def my_modal_pipeline(): + # Your pipeline steps here + ... +``` + +### Resource configuration + +You can specify different resource requirements for individual steps: + +```python +from zenml.config import ResourceSettings + +# Configure resources for a specific step +@step( + settings={ + "resources": ResourceSettings( + cpu_count=8, + memory="16GB", + gpu_count=1 + ), + "orchestrator": ModalOrchestratorSettings( + gpu="T4", + region="us-west-2" + ) + } +) +def gpu_training_step(): + # This step will run on a GPU + ... + +@step( + settings={ + "resources": ResourceSettings( + cpu_count=32, + memory="64GB" + ) + } +) +def cpu_intensive_step(): + # This step will run with high CPU/memory + ... + +@pipeline() +def my_pipeline(): + gpu_training_step() + cpu_intensive_step() +``` + +### Execution modes + +The Modal orchestrator supports two execution modes: + +1. **`pipeline` (default)**: Runs the entire pipeline in a single Modal function for maximum speed and cost efficiency +2. 
**`per_step`**: Runs each step in a separate Modal function for granular control and debugging + +```python +# Fast execution (default) - entire pipeline in one function +modal_settings = ModalOrchestratorSettings( + execution_mode="pipeline" +) + +# Granular execution - each step separate (useful for debugging) +modal_settings = ModalOrchestratorSettings( + execution_mode="per_step" +) +``` + +### Using GPUs + +Modal makes it easy to use GPUs for your ML workloads: + +```python +@step( + settings={ + "orchestrator": ModalOrchestratorSettings( + gpu="A100", # or "T4", "V100", etc. + region="us-east-1" + ), + "resources": ResourceSettings( + gpu_count=1 + ) + } +) +def train_model(): + # Your GPU-accelerated training code + import torch + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + ... +``` + +Available GPU types include: +- `T4` - Cost-effective for inference and light training +- `A10G` - Balanced performance for training and inference +- `A100` - High-performance for large model training +- `H100` - Latest generation for maximum performance + +### Synchronous vs Asynchronous execution + +You can choose whether to wait for pipeline completion or run asynchronously: + +```python +# Wait for completion (default) +modal_settings = ModalOrchestratorSettings( + synchronous=True +) + +# Fire-and-forget execution +modal_settings = ModalOrchestratorSettings( + synchronous=False +) +``` + +### Authentication with different environments + +For production deployments, you can specify different Modal environments: + +```python +modal_settings = ModalOrchestratorSettings( + environment="production", # or "staging", "dev", etc. 
+ workspace="my-company" +) +``` + +### Warm containers for faster execution + +Modal orchestrator uses persistent apps with warm containers to minimize cold starts: + +```python +modal_settings = ModalOrchestratorSettings( + min_containers=2, # Keep 2 containers warm + max_containers=20, # Scale up to 20 containers +) +``` + +This ensures your pipelines start executing immediately without waiting for container initialization. + +## Best practices + +1. **Use pipeline mode for production**: The default `pipeline` execution mode runs your entire pipeline in one function, minimizing overhead and cost. + +2. **Configure appropriate timeouts**: Set realistic timeouts for your workloads: + ```python + modal_settings = ModalOrchestratorSettings( + timeout=7200 # 2 hours + ) + ``` + +3. **Choose the right region**: Select regions close to your data sources to minimize transfer costs and latency. + +4. **Use appropriate GPU types**: Match GPU types to your workload requirements - don't use A100s for simple inference tasks. + +5. **Monitor resource usage**: Use Modal's dashboard to track your resource consumption and optimize accordingly. + +## Troubleshooting + +### Common issues + +1. **Authentication errors**: Ensure your Modal token is correctly configured and has the necessary permissions. + +2. **Image build failures**: Check that your Docker registry credentials are properly configured in your ZenML stack. + +3. **Resource limits**: If you hit resource limits, consider breaking large steps into smaller ones or requesting quota increases from Modal. + +4. **Network timeouts**: For long-running steps, ensure your timeout settings are appropriate. 
+ +### Getting help + +- Check the [Modal documentation](https://modal.com/docs) for platform-specific issues +- Monitor your functions in the [Modal dashboard](https://modal.com/apps) +- Use `zenml logs` to view detailed pipeline execution logs + +For more information and a full list of configurable attributes of the Modal orchestrator, check out the [SDK Docs](https://sdkdocs.zenml.io/latest/integration_code_docs/integrations-modal.html#zenml.integrations.modal.orchestrators). + +
ZenML Scarf
\ No newline at end of file diff --git a/docs/book/component-guide/toc.md b/docs/book/component-guide/toc.md index 77233c00229..3e810916fbf 100644 --- a/docs/book/component-guide/toc.md +++ b/docs/book/component-guide/toc.md @@ -19,6 +19,7 @@ * [Skypilot VM Orchestrator](orchestrators/skypilot-vm.md) * [HyperAI Orchestrator](orchestrators/hyperai.md) * [Lightning AI Orchestrator](orchestrators/lightning.md) + * [Modal Orchestrator](orchestrators/modal.md) * [Develop a custom orchestrator](orchestrators/custom.md) * [Artifact Stores](artifact-stores/README.md) * [Local Artifact Store](artifact-stores/local.md) From cd6b59f0fb7434321520b1564893f228c26291c3 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 12:04:42 +0200 Subject: [PATCH 03/77] Add pipeline-wide resource settings for hardware resources --- .../component-guide/orchestrators/modal.md | 151 +++++++++--- .../flavors/modal_orchestrator_flavor.py | 9 +- .../modal/orchestrators/modal_orchestrator.py | 218 +++++++++--------- 3 files changed, 231 insertions(+), 147 deletions(-) diff --git a/docs/book/component-guide/orchestrators/modal.md b/docs/book/component-guide/orchestrators/modal.md index 84eb1fac5c9..014d4e296fa 100644 --- a/docs/book/component-guide/orchestrators/modal.md +++ b/docs/book/component-guide/orchestrators/modal.md @@ -95,18 +95,38 @@ Modal provides an excellent web interface where you can monitor your pipeline ru You can access the Modal dashboard at [modal.com/apps](https://modal.com/apps) to see your running and completed functions. +### Configuration overview + +The Modal orchestrator uses two types of settings following ZenML's standard pattern: + +1. **`ResourceSettings`** (standard ZenML) - for hardware resource quantities: + - `cpu_count` - Number of CPU cores + - `memory` - Memory allocation (e.g., "16GB") + - `gpu_count` - Number of GPUs to allocate + +2. 
**`ModalOrchestratorSettings`** (Modal-specific) - for Modal platform configuration: + - `gpu` - GPU type specification (e.g., "T4", "A100", "H100") + - `region` - Cloud region preference + - `cloud` - Cloud provider selection + - `execution_mode` - How to run the pipeline + - `timeout`, `min_containers`, `max_containers` - Performance settings + +{% hint style="info" %} +**GPU Configuration**: Use `ResourceSettings.gpu_count` to specify how many GPUs you need, and `ModalOrchestratorSettings.gpu` to specify what type of GPU. Modal will combine these automatically (e.g., `gpu_count=2` + `gpu="A100"` becomes `"A100:2"`). +{% endhint %} + ### Additional configuration -For additional configuration of the Modal orchestrator, you can pass `ModalOrchestratorSettings` which allows you to configure resource requirements, execution modes, and cloud preferences: +Here's how to configure both types of settings: ```python from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( ModalOrchestratorSettings ) +from zenml.config import ResourceSettings +# Configure Modal-specific settings modal_settings = ModalOrchestratorSettings( - cpu_count=16, # Number of CPU cores - memory_mb=32768, # 32GB RAM gpu="A100", # GPU type (optional) region="us-east-1", # Preferred region cloud="aws", # Cloud provider @@ -116,9 +136,17 @@ modal_settings = ModalOrchestratorSettings( max_containers=10, # Scale up to 10 containers ) +# Configure hardware resources (quantities) +resource_settings = ResourceSettings( + cpu_count=16, # Number of CPU cores + memory="32GB", # 32GB RAM + gpu_count=1 # Number of GPUs (combined with gpu type below) +) + @pipeline( settings={ - "orchestrator": modal_settings + "orchestrator": modal_settings, + "resources": resource_settings } ) def my_modal_pipeline(): @@ -128,45 +156,46 @@ def my_modal_pipeline(): ### Resource configuration -You can specify different resource requirements for individual steps: +{% hint style="info" %} +**Pipeline-Level 
Resources**: The Modal orchestrator uses pipeline-level resource settings to configure the Modal function for the entire pipeline. All steps share the same Modal function resources. Configure resources at the `@pipeline` level for best results. +{% endhint %} + +You can configure pipeline-wide resource requirements using `ResourceSettings` for hardware resources and `ModalOrchestratorSettings` for Modal-specific configurations: ```python from zenml.config import ResourceSettings +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalOrchestratorSettings +) -# Configure resources for a specific step -@step( +# Configure resources at the pipeline level (recommended) +@pipeline( settings={ "resources": ResourceSettings( - cpu_count=8, - memory="16GB", - gpu_count=1 + cpu_count=16, + memory="32GB", + gpu_count=1 # These resources apply to the entire pipeline ), "orchestrator": ModalOrchestratorSettings( - gpu="T4", + gpu="A100", # GPU type for the entire pipeline region="us-west-2" ) } ) -def gpu_training_step(): - # This step will run on a GPU +def my_pipeline(): + first_step() # Runs with pipeline resources: 16 CPU, 32GB RAM, 1x A100 + second_step() # Runs with same resources: 16 CPU, 32GB RAM, 1x A100 ... -@step( - settings={ - "resources": ResourceSettings( - cpu_count=32, - memory="64GB" - ) - } -) -def cpu_intensive_step(): - # This step will run with high CPU/memory +@step +def first_step(): + # Uses pipeline-level resource configuration ... -@pipeline() -def my_pipeline(): - gpu_training_step() - cpu_intensive_step() +@step +def second_step(): + # Uses same pipeline-level resource configuration + ... ``` ### Execution modes @@ -174,7 +203,11 @@ def my_pipeline(): The Modal orchestrator supports two execution modes: 1. **`pipeline` (default)**: Runs the entire pipeline in a single Modal function for maximum speed and cost efficiency -2. **`per_step`**: Runs each step in a separate Modal function for granular control and debugging +2. 
**`per_step`**: Runs each step in a separate Modal function call for granular control and debugging + +{% hint style="info" %} +**Resource Sharing**: Both execution modes use the same Modal function with the same resource configuration (from pipeline-level settings). The difference is whether steps run sequentially in one function call (`pipeline`) or as separate function calls (`per_step`). +{% endhint %} ```python # Fast execution (default) - entire pipeline in one function @@ -190,22 +223,28 @@ modal_settings = ModalOrchestratorSettings( ### Using GPUs -Modal makes it easy to use GPUs for your ML workloads: +Modal makes it easy to use GPUs for your ML workloads. Use `ResourceSettings` to specify the number of GPUs and `ModalOrchestratorSettings` to specify the GPU type: ```python +from zenml.config import ResourceSettings +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalOrchestratorSettings +) + @step( settings={ + "resources": ResourceSettings( + gpu_count=1 # Number of GPUs to allocate + ), "orchestrator": ModalOrchestratorSettings( - gpu="A100", # or "T4", "V100", etc. 
+ gpu="A100", # GPU type: "T4", "A10G", "A100", "H100" region="us-east-1" - ), - "resources": ResourceSettings( - gpu_count=1 ) } ) def train_model(): # Your GPU-accelerated training code + # Modal will provision 1x A100 GPU (gpu_count=1 + gpu="A100") import torch device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Using device: {device}") @@ -218,6 +257,34 @@ Available GPU types include: - `A100` - High-performance for large model training - `H100` - Latest generation for maximum performance +**Examples of GPU configurations (applied to entire pipeline):** + +```python +# Pipeline with GPU - configure on first step or pipeline level +@pipeline( + settings={ + "resources": ResourceSettings(gpu_count=1), + "orchestrator": ModalOrchestratorSettings(gpu="A100") + } +) +def gpu_pipeline(): + # All steps in this pipeline will have access to 1x A100 GPU + step_one() + step_two() + +# Multiple GPUs - configure at pipeline level +@pipeline( + settings={ + "resources": ResourceSettings(gpu_count=4), + "orchestrator": ModalOrchestratorSettings(gpu="A100") + } +) +def multi_gpu_pipeline(): + # All steps in this pipeline will have access to 4x A100 GPUs + training_step() + evaluation_step() +``` + ### Synchronous vs Asynchronous execution You can choose whether to wait for pipeline completion or run asynchronously: @@ -254,6 +321,14 @@ modal_settings = ModalOrchestratorSettings( min_containers=2, # Keep 2 containers warm max_containers=20, # Scale up to 20 containers ) + +@pipeline( + settings={ + "orchestrator": modal_settings + } +) +def my_pipeline(): + ... ``` This ensures your pipelines start executing immediately without waiting for container initialization. @@ -262,18 +337,20 @@ This ensures your pipelines start executing immediately without waiting for cont 1. **Use pipeline mode for production**: The default `pipeline` execution mode runs your entire pipeline in one function, minimizing overhead and cost. -2. 
**Configure appropriate timeouts**: Set realistic timeouts for your workloads: +2. **Separate resource and orchestrator settings**: Use `ResourceSettings` for hardware (CPU, memory, GPU count) and `ModalOrchestratorSettings` for Modal-specific configurations (GPU type, region, etc.). + +3. **Configure appropriate timeouts**: Set realistic timeouts for your workloads: ```python modal_settings = ModalOrchestratorSettings( timeout=7200 # 2 hours ) ``` -3. **Choose the right region**: Select regions close to your data sources to minimize transfer costs and latency. +4. **Choose the right region**: Select regions close to your data sources to minimize transfer costs and latency. -4. **Use appropriate GPU types**: Match GPU types to your workload requirements - don't use A100s for simple inference tasks. +5. **Use appropriate GPU types**: Match GPU types to your workload requirements - don't use A100s for simple inference tasks. -5. **Monitor resource usage**: Use Modal's dashboard to track your resource consumption and optimize accordingly. +6. **Monitor resource usage**: Use Modal's dashboard to track your resource consumption and optimize accordingly. ## Troubleshooting diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 87e88cd8551..9d0ebc6a6e5 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -30,12 +30,11 @@ class ModalOrchestratorSettings(BaseSettings): """Modal orchestrator settings. Attributes: - gpu: The type of GPU to use for the pipeline execution. + gpu: The type of GPU to use for the pipeline execution (e.g., "T4", "A100"). + Use ResourceSettings.gpu_count to specify the number of GPUs. region: The region to use for the pipeline execution. cloud: The cloud provider to use for the pipeline execution. 
environment: The Modal environment to use for the pipeline execution. - cpu_count: Number of CPU cores to allocate. - memory_mb: Memory in MB to allocate. timeout: Maximum execution time in seconds (default 24h). min_containers: Minimum containers to keep warm (replaces keep_warm). max_containers: Maximum concurrent containers (replaces concurrency_limit). @@ -47,10 +46,6 @@ class ModalOrchestratorSettings(BaseSettings): region: Optional[str] = None cloud: Optional[str] = None environment: Optional[str] = None - cpu_count: Optional[int] = ( - 32 # Default 32 CPU cores for blazing fast execution - ) - memory_mb: Optional[int] = 65536 # Default 64GB RAM for maximum speed timeout: int = 86400 # 24 hours (Modal's maximum) min_containers: Optional[int] = ( 1 # Keep 1 container warm for sequential execution diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index a7dda222580..dd0465e0b2d 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -14,8 +14,20 @@ """Implementation of a Modal orchestrator.""" import os +import sys import time -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast +import traceback +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Optional, + Tuple, + Type, + Union, + cast, +) from uuid import uuid4 try: @@ -23,9 +35,17 @@ except ImportError: modal = None # type: ignore +from zenml.config import ResourceSettings from zenml.config.base_settings import BaseSettings from zenml.config.build_configuration import BuildConfiguration -from zenml.config.resource_settings import ByteUnit, ResourceSettings +from zenml.config.constants import RESOURCE_SETTINGS_KEY +from zenml.config.resource_settings import ByteUnit +from zenml.entrypoints.pipeline_entrypoint_configuration import ( + PipelineEntrypointConfiguration, +) +from 
zenml.entrypoints.step_entrypoint_configuration import ( + StepEntrypointConfiguration, +) from zenml.enums import StackComponentType from zenml.logger import get_logger from zenml.orchestrators import ContainerizedOrchestrator @@ -60,22 +80,15 @@ def run_step_in_modal( Raises: Exception: If step execution fails. """ - import os - import sys - - print(f"🚀 Running step '{step_name}' in Modal") + logger.info(f"Running step '{step_name}' in Modal") sys.stdout.flush() # Set the orchestrator run ID in the Modal environment os.environ["ZENML_MODAL_ORCHESTRATOR_RUN_ID"] = orchestrator_run_id try: - from zenml.entrypoints.step_entrypoint_configuration import ( - StepEntrypointConfiguration, - ) - - print( - f"🔧 Executing step '{step_name}' directly in process for maximum speed" + logger.info( + f"Executing step '{step_name}' directly in process for maximum speed" ) sys.stdout.flush() @@ -88,15 +101,13 @@ def run_step_in_modal( config = StepEntrypointConfiguration(arguments=args) config.run() - print(f"✅ Step {step_name} completed successfully") + logger.info(f"Step {step_name} completed successfully") sys.stdout.flush() except Exception as e: - import traceback - error_details = traceback.format_exc() - print(f"💥 Error executing step {step_name}: {e}") - print(f"📝 Full traceback:\n{error_details}") + logger.error(f"Error executing step {step_name}: {e}") + logger.debug(f"Full traceback:\n{error_details}") sys.stdout.flush() raise @@ -114,49 +125,37 @@ def run_entire_pipeline( Raises: Exception: If pipeline execution fails. 
""" - import os - import time - - print( - "🚀 [MODAL] Starting ENTIRE PIPELINE using PipelineEntrypointConfiguration!", - flush=True, + logger.info( + "Starting entire pipeline using PipelineEntrypointConfiguration", + extra={ + "deployment_id": deployment_id, + "orchestrator_run_id": orchestrator_run_id, + }, ) - print(f"📝 [MODAL] Deployment ID: {deployment_id}", flush=True) - print(f"🆔 [MODAL] Orchestrator Run ID: {orchestrator_run_id}", flush=True) - print(f"⏰ [MODAL] Start time: {time.strftime('%H:%M:%S')}", flush=True) # Set the orchestrator run ID in the Modal environment os.environ["ZENML_MODAL_ORCHESTRATOR_RUN_ID"] = orchestrator_run_id try: - from zenml.entrypoints.pipeline_entrypoint_configuration import ( - PipelineEntrypointConfiguration, - ) - - print( - "🔧 [MODAL] Initializing pipeline entrypoint configuration...", - flush=True, - ) + logger.debug("Initializing pipeline entrypoint configuration") # Create the entrypoint arguments args = PipelineEntrypointConfiguration.get_entrypoint_arguments( deployment_id=deployment_id ) - print("⚙️ [MODAL] Creating pipeline configuration...", flush=True) + logger.debug("Creating pipeline configuration") config = PipelineEntrypointConfiguration(arguments=args) - print("🏃 [MODAL] Executing entire pipeline...", flush=True) + logger.info("Executing entire pipeline") config.run() - print("🎉 [MODAL] ENTIRE PIPELINE COMPLETED SUCCESSFULLY!", flush=True) + logger.info("Entire pipeline completed successfully") except Exception as e: - import traceback - error_details = traceback.format_exc() - print(f"💥 [MODAL] Error executing pipeline: {e}", flush=True) - print(f"📝 [MODAL] Full traceback:\n{error_details}", flush=True) + logger.error(f"Error executing pipeline: {e}") + logger.debug(f"Full traceback:\n{error_details}") raise @@ -180,25 +179,23 @@ def get_gpu_values( def get_resource_values( - config: "ModalOrchestratorConfig", resource_settings: ResourceSettings + resource_settings: ResourceSettings, ) -> 
Tuple[Optional[int], Optional[int]]: - """Get CPU and memory values with config fallbacks. + """Get CPU and memory values from resource settings. Args: - config: The Modal orchestrator config. resource_settings: The resource settings. Returns: - Tuple of (cpu_count, memory_mb) with config fallbacks. + Tuple of (cpu_count, memory_mb). """ - # Prefer pipeline resource settings, fallback to config defaults - cpu_count_raw = resource_settings.cpu_count or config.cpu_count + # Get CPU count cpu_count: Optional[int] = None - if cpu_count_raw is not None: - cpu_count = int(cpu_count_raw) + if resource_settings.cpu_count is not None: + cpu_count = int(resource_settings.cpu_count) # Convert memory to MB if needed - memory_mb: Optional[int] = config.memory_mb + memory_mb: Optional[int] = None if resource_settings.memory: memory_value = resource_settings.get_memory(ByteUnit.MB) if memory_value is not None: @@ -251,14 +248,12 @@ def get_or_deploy_persistent_modal_app( mode_suffix = execution_mode.replace("_", "-") # Create a 2-hour timestamp window (rounds down to nearest 2-hour boundary) - import time - current_time = int(time.time()) two_hour_window = current_time // (2 * 3600) # 2 hours = 7200 seconds app_name = f"zenml-{pipeline_name.replace('_', '-')}-{mode_suffix}-{two_hour_window}" - logger.info(f"🏗️ Getting/deploying persistent Modal app: {app_name}") + logger.info(f"Getting/deploying persistent Modal app: {app_name}") # Create the app app = modal.App(app_name) @@ -269,11 +264,11 @@ def get_or_deploy_persistent_modal_app( # Create the execution function based on execution mode if execution_mode == "per_step": - logger.info("🔧 Creating per-step mode for granular execution") + logger.debug("Creating per-step mode for granular execution") execution_func: Any = run_step_in_modal function_name = "run_step_in_modal" else: - logger.info("🚀 Creating pipeline mode for MAXIMUM SPEED!") + logger.debug("Creating pipeline mode for maximum speed") execution_func = 
run_entire_pipeline function_name = "run_entire_pipeline" @@ -291,8 +286,8 @@ def get_or_deploy_persistent_modal_app( # Try to lookup existing app in current 2-hour window, deploy if not found try: - logger.info( - f"🔍 Checking for Modal app in current 2-hour window: {app_name}" + logger.debug( + f"Checking for Modal app in current 2-hour window: {app_name}" ) try: @@ -300,7 +295,7 @@ def get_or_deploy_persistent_modal_app( app_name, environment_name=environment_name or "main" ) logger.info( - f"♻️ Found existing app '{app_name}' with fresh tokens - reusing warm containers!" + f"Found existing app '{app_name}' with fresh tokens - reusing warm containers" ) # Try to get the function directly @@ -310,40 +305,37 @@ def get_or_deploy_persistent_modal_app( function_name, environment_name=environment_name or "main", ) - logger.info( - "✅ Successfully retrieved function from existing app!" + logger.debug( + "Successfully retrieved function from existing app" ) return existing_function except Exception as func_error: logger.warning( - f"⚠️ Function lookup failed: {func_error}, redeploying..." + f"Function lookup failed: {func_error}, redeploying" ) # Fall through to deployment except Exception: # App not found or other lookup error - deploy fresh app - logger.info( - "🆕 No app found for current 2-hour window, deploying fresh app..." 
+ logger.debug( + "No app found for current 2-hour window, deploying fresh app" ) # Deploy the app app.deploy(name=app_name, environment_name=environment_name or "main") logger.info( - f"✅ App '{app_name}' deployed with fresh tokens and {effective_min_containers} warm containers" + f"App '{app_name}' deployed with {effective_min_containers} warm containers" ) logger.info( - f"📱 View real-time logs at: https://modal.com/apps/{app_name}" + f"View real-time logs at: https://modal.com/apps/{app_name}" ) except Exception as e: - logger.error(f"❌ Deployment failed: {e}") + logger.error(f"Deployment failed: {e}") raise logger.info( - f"🔥 Modal app configured for SPEED with min_containers={effective_min_containers}, max_containers={effective_max_containers}" - ) - logger.info( - f"💡 This means {effective_min_containers} containers will stay warm for faster execution!" + f"Modal app configured with min_containers={effective_min_containers}, max_containers={effective_max_containers}" ) return execute_step_func @@ -564,57 +556,77 @@ def prepare_or_run_pipeline( orchestrator_run_id = str(uuid4()) environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id - # Get settings from the first step (all steps use same Modal resources) - first_step = list(deployment.step_configurations.values())[0] + # Get settings from pipeline configuration (applies to entire pipeline) settings = cast( - "ModalOrchestratorSettings", self.get_settings(first_step) + "ModalOrchestratorSettings", self.get_settings(deployment) ) - resource_settings = first_step.config.resource_settings + + # Get resource settings from pipeline configuration + + pipeline_resource_settings: Union[Dict[str, Any], Any] = ( + deployment.pipeline_configuration.settings.get( + RESOURCE_SETTINGS_KEY, {} + ) + ) + if pipeline_resource_settings: + # Convert to dict if it's a BaseSettings instance + if hasattr(pipeline_resource_settings, "model_dump"): + pipeline_resource_dict = ( + 
pipeline_resource_settings.model_dump() + ) + else: + pipeline_resource_dict = pipeline_resource_settings + resource_settings = ResourceSettings.model_validate( + pipeline_resource_dict + ) + else: + # Fallback to first step's resource settings if no pipeline-level resources + if deployment.step_configurations: + first_step = list(deployment.step_configurations.values())[0] + resource_settings = first_step.config.resource_settings + else: + resource_settings = ( + ResourceSettings() + ) # Default empty settings # Build Modal image zenml_image = self._build_modal_image(deployment, stack, environment) - # Configure resources with config fallbacks + # Configure resources from resource settings gpu_values = get_gpu_values(settings, resource_settings) - cpu_count, memory_mb = get_resource_values( - self.config, resource_settings - ) + cpu_count, memory_mb = get_resource_values(resource_settings) start_time = time.time() - # Execute steps using Modal's fast container spin-up with PERSISTENT app + # Execute steps using Modal's fast container spin-up with persistent app logger.info( - "🚀 Starting pipeline execution with PERSISTENT Modal functions..." 
+ "Starting pipeline execution with persistent Modal functions" ) step_names = list(deployment.step_configurations.keys()) - logger.info(f"📋 Found {len(step_names)} steps: {step_names}") + logger.debug(f"Found {len(step_names)} steps: {step_names}") - # Get or deploy persistent Modal app with BLAZING FAST warm containers + # Get or deploy persistent Modal app with warm containers execute_step = get_or_deploy_persistent_modal_app( pipeline_name=deployment.pipeline_configuration.name, zenml_image=zenml_image, gpu_values=gpu_values, - cpu_count=cpu_count or 8, # Default to 8 CPU cores for speed - memory_mb=memory_mb or 16384, # Default to 16GB RAM for speed + cpu_count=cpu_count, # Use ResourceSettings value or None (Modal default) + memory_mb=memory_mb, # Use ResourceSettings value or None (Modal default) cloud=settings.cloud or self.config.cloud, region=settings.region or self.config.region, timeout=settings.timeout or self.config.timeout, min_containers=settings.min_containers - or self.config.min_containers - or 1, # Keep 1 warm container for sequential execution + or self.config.min_containers, max_containers=settings.max_containers - or self.config.max_containers - or 10, # Scale to 10 containers + or self.config.max_containers, environment_name=settings.environment or self.config.environment, # Use environment from config/settings execution_mode=settings.execution_mode or self.config.execution_mode, # Use execution mode from settings ) - logger.info( - "⚡ Executing with DEPLOYED Modal app and warm containers..." - ) + logger.info("Executing with deployed Modal app and warm containers") # Execute based on execution mode with improved Modal Function API usage execution_mode = settings.execution_mode or self.config.execution_mode @@ -636,27 +648,27 @@ def execute_modal_function( Returns: Result of the Modal function execution. 
""" - logger.info(f"🚀 {description}") + logger.info(f"Starting {description}") if sync_execution: - logger.info("⚡ Using .remote() for synchronous execution") + logger.debug("Using .remote() for synchronous execution") # .remote() waits for completion but doesn't stream logs result = execute_step.remote(*func_args) - logger.info(f"✅ {description} completed successfully!") + logger.info(f"{description} completed successfully") return result else: - logger.info( - "🔥 Using .spawn() for asynchronous fire-and-forget execution" + logger.debug( + "Using .spawn() for asynchronous fire-and-forget execution" ) # .spawn() for fire-and-forget (async) function_call = execute_step.spawn(*func_args) logger.info( - f"🚀 {description} started asynchronously (not waiting for completion)" + f"{description} started asynchronously (not waiting for completion)" ) return function_call if execution_mode == "per_step": - logger.info("🔧 Using per-step mode for granular execution...") + logger.info("Using per-step mode for granular execution") # Execute steps individually for step_name in step_names: try: @@ -665,19 +677,19 @@ def execute_modal_function( f"Step '{step_name}' execution", ) except Exception as e: - logger.error(f"❌ Step '{step_name}' failed: {e}") - logger.error("💡 Check Modal dashboard for detailed logs") + logger.error(f"Step '{step_name}' failed: {e}") + logger.info("Check Modal dashboard for detailed logs") raise else: # Default: execute entire pipeline in one function try: execute_modal_function( (deployment.id, orchestrator_run_id), - "Pipeline execution (MAXIMUM SPEED)", + "Pipeline execution", ) except Exception as e: - logger.error(f"❌ Pipeline failed: {e}") - logger.error("💡 Check Modal dashboard for detailed logs") + logger.error(f"Pipeline failed: {e}") + logger.info("Check Modal dashboard for detailed logs") raise run_duration = time.time() - start_time From 58d72c5bdb5084cc98b0d8bd8f5bf048c1c49ae6 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 
12:16:36 +0200 Subject: [PATCH 04/77] Add Modal step operator orchestrator to run pipelines on Modal --- .../flavors/modal_step_operator_flavor.py | 22 +- .../modal/orchestrators/modal_orchestrator.py | 333 ++---------------- .../step_operators/modal_step_operator.py | 108 +----- 3 files changed, 76 insertions(+), 387 deletions(-) diff --git a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py index a66580faa54..ead81a08199 100644 --- a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py @@ -15,6 +15,8 @@ from typing import TYPE_CHECKING, Optional, Type +from pydantic import SecretStr + from zenml.config.base_settings import BaseSettings from zenml.integrations.modal import MODAL_STEP_OPERATOR_FLAVOR from zenml.step_operators import BaseStepOperatorConfig, BaseStepOperatorFlavor @@ -36,20 +38,36 @@ class ModalStepOperatorSettings(BaseSettings): incompatible. See more in the Modal docs at https://modal.com/docs/guide/region-selection. Attributes: - gpu: The type of GPU to use for the step execution. + gpu: The type of GPU to use for the step execution (e.g., "T4", "A100"). + Use ResourceSettings.gpu_count to specify the number of GPUs. region: The region to use for the step execution. cloud: The cloud provider to use for the step execution. + environment: The Modal environment to use for the step execution. + timeout: Maximum execution time in seconds (default 24h). """ gpu: Optional[str] = None region: Optional[str] = None cloud: Optional[str] = None + environment: Optional[str] = None + timeout: int = 86400 # 24 hours (Modal's maximum) class ModalStepOperatorConfig( BaseStepOperatorConfig, ModalStepOperatorSettings ): - """Configuration for the Modal step operator.""" + """Configuration for the Modal step operator. + + Attributes: + token: Modal API token for authentication. 
If not provided, + falls back to Modal's default authentication (~/.modal.toml). + workspace: Modal workspace name (optional). + environment: Modal environment name (optional). + """ + + token: Optional[SecretStr] = None + workspace: Optional[str] = None + environment: Optional[str] = None @property def is_remote(self) -> bool: diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index dd0465e0b2d..3368d70b592 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -25,7 +25,6 @@ Optional, Tuple, Type, - Union, cast, ) from uuid import uuid4 @@ -35,18 +34,25 @@ except ImportError: modal = None # type: ignore -from zenml.config import ResourceSettings from zenml.config.base_settings import BaseSettings from zenml.config.build_configuration import BuildConfiguration from zenml.config.constants import RESOURCE_SETTINGS_KEY -from zenml.config.resource_settings import ByteUnit from zenml.entrypoints.pipeline_entrypoint_configuration import ( PipelineEntrypointConfiguration, ) from zenml.entrypoints.step_entrypoint_configuration import ( StepEntrypointConfiguration, ) -from zenml.enums import StackComponentType +from zenml.integrations.modal.utils import ( + ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, + build_modal_image, + create_modal_stack_validator, + get_gpu_values, + get_or_deploy_persistent_modal_app, + get_resource_settings_from_deployment, + get_resource_values, + setup_modal_client, +) from zenml.logger import get_logger from zenml.orchestrators import ContainerizedOrchestrator from zenml.stack import Stack, StackValidator @@ -62,8 +68,6 @@ logger = get_logger(__name__) -ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID = "ZENML_MODAL_ORCHESTRATOR_RUN_ID" - def run_step_in_modal( step_name: str, @@ -159,188 +163,6 @@ def run_entire_pipeline( raise -def get_gpu_values( - settings: 
"ModalOrchestratorSettings", resource_settings: ResourceSettings -) -> Optional[str]: - """Get the GPU values for the Modal orchestrator. - - Args: - settings: The Modal orchestrator settings. - resource_settings: The resource settings. - - Returns: - The GPU string if a count is specified, otherwise the GPU type. - """ - if not settings.gpu: - return None - # Prefer resource_settings gpu_count, fallback to 1 - gpu_count = resource_settings.gpu_count or 1 - return f"{settings.gpu}:{gpu_count}" if gpu_count > 1 else settings.gpu - - -def get_resource_values( - resource_settings: ResourceSettings, -) -> Tuple[Optional[int], Optional[int]]: - """Get CPU and memory values from resource settings. - - Args: - resource_settings: The resource settings. - - Returns: - Tuple of (cpu_count, memory_mb). - """ - # Get CPU count - cpu_count: Optional[int] = None - if resource_settings.cpu_count is not None: - cpu_count = int(resource_settings.cpu_count) - - # Convert memory to MB if needed - memory_mb: Optional[int] = None - if resource_settings.memory: - memory_value = resource_settings.get_memory(ByteUnit.MB) - if memory_value is not None: - memory_mb = int(memory_value) - - return cpu_count, memory_mb - - -def get_or_deploy_persistent_modal_app( - pipeline_name: str, - zenml_image: Any, - gpu_values: Optional[str], - cpu_count: Optional[int], - memory_mb: Optional[int], - cloud: Optional[str], - region: Optional[str], - timeout: int, - min_containers: Optional[int], - max_containers: Optional[int], - environment_name: Optional[str] = None, - execution_mode: str = "single_function", -) -> Any: - """Get or deploy a persistent Modal app with warm containers. - - This function deploys a Modal app that stays alive with warm containers - for maximum speed between pipeline runs. - - Args: - pipeline_name: Name of the pipeline. - zenml_image: Pre-built ZenML Docker image for Modal. - gpu_values: GPU configuration string. - cpu_count: Number of CPU cores. 
- memory_mb: Memory allocation in MB. - cloud: Cloud provider to use. - region: Region to deploy in. - timeout: Maximum execution timeout. - min_containers: Minimum containers to keep warm. - max_containers: Maximum containers to scale to. - environment_name: Modal environment name. - execution_mode: Execution mode for the function. - - Returns: - The Modal function ready for execution. - - Raises: - Exception: If deployment fails. - """ - # Use pipeline name + execution mode + 2-hour window for app reuse - # This ensures apps get redeployed every 2 hours to refresh tokens - mode_suffix = execution_mode.replace("_", "-") - - # Create a 2-hour timestamp window (rounds down to nearest 2-hour boundary) - current_time = int(time.time()) - two_hour_window = current_time // (2 * 3600) # 2 hours = 7200 seconds - - app_name = f"zenml-{pipeline_name.replace('_', '-')}-{mode_suffix}-{two_hour_window}" - - logger.info(f"Getting/deploying persistent Modal app: {app_name}") - - # Create the app - app = modal.App(app_name) - - # Ensure we have minimum containers for fast startup - effective_min_containers = min_containers or 1 - effective_max_containers = max_containers or 10 - - # Create the execution function based on execution mode - if execution_mode == "per_step": - logger.debug("Creating per-step mode for granular execution") - execution_func: Any = run_step_in_modal - function_name = "run_step_in_modal" - else: - logger.debug("Creating pipeline mode for maximum speed") - execution_func = run_entire_pipeline - function_name = "run_entire_pipeline" - - execute_step_func = app.function( - image=zenml_image, - gpu=gpu_values, - cpu=cpu_count, - memory=memory_mb, - cloud=cloud, - region=region, - timeout=timeout, - min_containers=effective_min_containers, # Keep containers warm for speed - max_containers=effective_max_containers, # Allow scaling - )(execution_func) - - # Try to lookup existing app in current 2-hour window, deploy if not found - try: - logger.debug( - 
f"Checking for Modal app in current 2-hour window: {app_name}" - ) - - try: - modal.App.lookup( - app_name, environment_name=environment_name or "main" - ) - logger.info( - f"Found existing app '{app_name}' with fresh tokens - reusing warm containers" - ) - - # Try to get the function directly - try: - existing_function = modal.Function.from_name( - app_name, - function_name, - environment_name=environment_name or "main", - ) - logger.debug( - "Successfully retrieved function from existing app" - ) - return existing_function - except Exception as func_error: - logger.warning( - f"Function lookup failed: {func_error}, redeploying" - ) - # Fall through to deployment - - except Exception: - # App not found or other lookup error - deploy fresh app - logger.debug( - "No app found for current 2-hour window, deploying fresh app" - ) - - # Deploy the app - app.deploy(name=app_name, environment_name=environment_name or "main") - logger.info( - f"App '{app_name}' deployed with {effective_min_containers} warm containers" - ) - logger.info( - f"View real-time logs at: https://modal.com/apps/{app_name}" - ) - - except Exception as e: - logger.error(f"Deployment failed: {e}") - raise - - logger.info( - f"Modal app configured with min_containers={effective_min_containers}, max_containers={effective_max_containers}" - ) - - return execute_step_func - - class ModalOrchestrator(ContainerizedOrchestrator): """Orchestrator responsible for running entire pipelines on Modal. 
@@ -373,18 +195,11 @@ def settings_class(self) -> Optional[Type["BaseSettings"]]: def _setup_modal_client(self) -> None: """Setup Modal client with authentication.""" - if self.config.token: - # Set Modal token from config - os.environ["MODAL_TOKEN_ID"] = self.config.token.get_secret_value() - logger.info("Using Modal token from orchestrator config") - else: - logger.info("Using default Modal authentication (~/.modal.toml)") - - # Set workspace/environment if provided - if self.config.workspace: - os.environ["MODAL_WORKSPACE"] = self.config.workspace - if self.config.environment: - os.environ["MODAL_ENVIRONMENT"] = self.config.environment + setup_modal_client( + token=self.config.token, + workspace=self.config.workspace, + environment=self.config.environment, + ) @property def validator(self) -> Optional[StackValidator]: @@ -393,40 +208,7 @@ def validator(self) -> Optional[StackValidator]: Returns: A `StackValidator` instance. """ - - def _validate_remote_components(stack: "Stack") -> tuple[bool, str]: - if stack.artifact_store.config.is_local: - return False, ( - "The Modal orchestrator runs code remotely and " - "needs to write files into the artifact store, but the " - f"artifact store `{stack.artifact_store.name}` of the " - "active stack is local. Please ensure that your stack " - "contains a remote artifact store when using the Modal " - "orchestrator." - ) - - container_registry = stack.container_registry - assert container_registry is not None - - if container_registry.config.is_local: - return False, ( - "The Modal orchestrator runs code remotely and " - "needs to push/pull Docker images, but the " - f"container registry `{container_registry.name}` of the " - "active stack is local. Please ensure that your stack " - "contains a remote container registry when using the " - "Modal orchestrator." 
- ) - - return True, "" - - return StackValidator( - required_components={ - StackComponentType.CONTAINER_REGISTRY, - StackComponentType.IMAGE_BUILDER, - }, - custom_validation_function=_validate_remote_components, - ) + return create_modal_stack_validator() def get_orchestrator_run_id(self) -> str: """Returns the active orchestrator run id. @@ -484,38 +266,7 @@ def _build_modal_image( # Get the ZenML-built image that contains all pipeline code image_name = self.get_image(deployment=deployment) - if not stack.container_registry: - raise ValueError( - "No Container registry found in the stack. " - "Please add a container registry and ensure " - "it is correctly configured." - ) - - if docker_creds := stack.container_registry.credentials: - docker_username, docker_password = docker_creds - else: - raise RuntimeError( - "No Docker credentials found for the container registry." - ) - - # Create Modal secret for registry authentication - registry_secret = modal.Secret.from_dict( - { - "REGISTRY_USERNAME": docker_username, - "REGISTRY_PASSWORD": docker_password, - } - ) - - # Build Modal image from the ZenML-built image - # Use from_registry to pull the ZenML image with authentication - # and install Modal dependencies - zenml_image = ( - modal.Image.from_registry(image_name, secret=registry_secret) - .pip_install("modal") # Install Modal in the container - .env(environment) - ) - - return zenml_image + return build_modal_image(image_name, stack, environment) def prepare_or_run_pipeline( self, @@ -562,38 +313,15 @@ def prepare_or_run_pipeline( ) # Get resource settings from pipeline configuration - - pipeline_resource_settings: Union[Dict[str, Any], Any] = ( - deployment.pipeline_configuration.settings.get( - RESOURCE_SETTINGS_KEY, {} - ) + resource_settings = get_resource_settings_from_deployment( + deployment, RESOURCE_SETTINGS_KEY ) - if pipeline_resource_settings: - # Convert to dict if it's a BaseSettings instance - if hasattr(pipeline_resource_settings, 
"model_dump"): - pipeline_resource_dict = ( - pipeline_resource_settings.model_dump() - ) - else: - pipeline_resource_dict = pipeline_resource_settings - resource_settings = ResourceSettings.model_validate( - pipeline_resource_dict - ) - else: - # Fallback to first step's resource settings if no pipeline-level resources - if deployment.step_configurations: - first_step = list(deployment.step_configurations.values())[0] - resource_settings = first_step.config.resource_settings - else: - resource_settings = ( - ResourceSettings() - ) # Default empty settings # Build Modal image zenml_image = self._build_modal_image(deployment, stack, environment) # Configure resources from resource settings - gpu_values = get_gpu_values(settings, resource_settings) + gpu_values = get_gpu_values(settings.gpu, resource_settings) cpu_count, memory_mb = get_resource_values(resource_settings) start_time = time.time() @@ -606,10 +334,26 @@ def prepare_or_run_pipeline( step_names = list(deployment.step_configurations.keys()) logger.debug(f"Found {len(step_names)} steps: {step_names}") + # Create the execution function based on execution mode + execution_mode = settings.execution_mode or self.config.execution_mode + if execution_mode == "per_step": + logger.debug("Creating per-step mode for granular execution") + execution_func: Any = run_step_in_modal + function_name = "run_step_in_modal" + else: + logger.debug("Creating pipeline mode for maximum speed") + execution_func = run_entire_pipeline + function_name = "run_entire_pipeline" + # Get or deploy persistent Modal app with warm containers + mode_suffix = execution_mode.replace("_", "-") + app_name_base = f"zenml-{deployment.pipeline_configuration.name.replace('_', '-')}-{mode_suffix}" + execute_step = get_or_deploy_persistent_modal_app( - pipeline_name=deployment.pipeline_configuration.name, + app_name_base=app_name_base, zenml_image=zenml_image, + execution_func=execution_func, + function_name=function_name, gpu_values=gpu_values, 
cpu_count=cpu_count, # Use ResourceSettings value or None (Modal default) memory_mb=memory_mb, # Use ResourceSettings value or None (Modal default) @@ -622,14 +366,11 @@ def prepare_or_run_pipeline( or self.config.max_containers, environment_name=settings.environment or self.config.environment, # Use environment from config/settings - execution_mode=settings.execution_mode - or self.config.execution_mode, # Use execution mode from settings ) logger.info("Executing with deployed Modal app and warm containers") # Execute based on execution mode with improved Modal Function API usage - execution_mode = settings.execution_mode or self.config.execution_mode sync_execution = ( settings.synchronous if hasattr(settings, "synchronous") diff --git a/src/zenml/integrations/modal/step_operators/modal_step_operator.py b/src/zenml/integrations/modal/step_operators/modal_step_operator.py index 4b062feaea2..410655697ad 100644 --- a/src/zenml/integrations/modal/step_operators/modal_step_operator.py +++ b/src/zenml/integrations/modal/step_operators/modal_step_operator.py @@ -14,21 +14,25 @@ """Modal step operator implementation.""" import asyncio -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, cast +from typing import TYPE_CHECKING, Dict, List, Optional, Type, cast import modal -from modal_proto import api_pb2 from zenml.client import Client from zenml.config.build_configuration import BuildConfiguration -from zenml.config.resource_settings import ByteUnit, ResourceSettings -from zenml.enums import StackComponentType +from zenml.config.resource_settings import ByteUnit from zenml.integrations.modal.flavors import ( ModalStepOperatorConfig, ModalStepOperatorSettings, ) +from zenml.integrations.modal.utils import ( + build_modal_image, + create_modal_stack_validator, + get_gpu_values, + setup_modal_client, +) from zenml.logger import get_logger -from zenml.stack import Stack, StackValidator +from zenml.stack import StackValidator from zenml.step_operators import 
BaseStepOperator if TYPE_CHECKING: @@ -41,24 +45,6 @@ MODAL_STEP_OPERATOR_DOCKER_IMAGE_KEY = "modal_step_operator" -def get_gpu_values( - settings: ModalStepOperatorSettings, resource_settings: ResourceSettings -) -> Optional[str]: - """Get the GPU values for the Modal step operator. - - Args: - settings: The Modal step operator settings. - resource_settings: The resource settings. - - Returns: - The GPU string if a count is specified, otherwise the GPU type. - """ - if not settings.gpu: - return None - gpu_count = resource_settings.gpu_count - return f"{settings.gpu}:{gpu_count}" if gpu_count else settings.gpu - - class ModalStepOperator(BaseStepOperator): """Step operator to run a step on Modal. @@ -91,40 +77,7 @@ def validator(self) -> Optional[StackValidator]: Returns: The stack validator. """ - - def _validate_remote_components(stack: "Stack") -> Tuple[bool, str]: - if stack.artifact_store.config.is_local: - return False, ( - "The Modal step operator runs code remotely and " - "needs to write files into the artifact store, but the " - f"artifact store `{stack.artifact_store.name}` of the " - "active stack is local. Please ensure that your stack " - "contains a remote artifact store when using the Modal " - "step operator." - ) - - container_registry = stack.container_registry - assert container_registry is not None - - if container_registry.config.is_local: - return False, ( - "The Modal step operator runs code remotely and " - "needs to push/pull Docker images, but the " - f"container registry `{container_registry.name}` of the " - "active stack is local. Please ensure that your stack " - "contains a remote container registry when using the " - "Modal step operator." 
- ) - - return True, "" - - return StackValidator( - required_components={ - StackComponentType.CONTAINER_REGISTRY, - StackComponentType.IMAGE_BUILDER, - }, - custom_validation_function=_validate_remote_components, - ) + return create_modal_stack_validator() def get_docker_builds( self, deployment: "PipelineDeploymentBase" @@ -171,41 +124,18 @@ def launch( zc = Client() stack = zc.active_stack - if not stack.container_registry: - raise ValueError( - "No Container registry found in the stack. " - "Please add a container registry and ensure " - "it is correctly configured." - ) - - if docker_creds := stack.container_registry.credentials: - docker_username, docker_password = docker_creds - else: - raise RuntimeError( - "No Docker credentials found for the container registry." - ) - - my_secret = modal.secret._Secret.from_dict( - { - "REGISTRY_USERNAME": docker_username, - "REGISTRY_PASSWORD": docker_password, - } - ) - - spec = modal.image.DockerfileSpec( - commands=[f"FROM {image_name}"], context_files={} + # Setup Modal authentication + setup_modal_client( + token=self.config.token, + workspace=self.config.workspace, + environment=self.config.environment, ) - zenml_image = modal.Image._from_args( - dockerfile_function=lambda *_, **__: spec, - force_build=False, - image_registry_config=modal.image._ImageRegistryConfig( - api_pb2.REGISTRY_AUTH_TYPE_STATIC_CREDS, my_secret - ), - ).env(environment) + # Build Modal image using shared utility + zenml_image = build_modal_image(image_name, stack, environment) resource_settings = info.config.resource_settings - gpu_values = get_gpu_values(settings, resource_settings) + gpu_values = get_gpu_values(settings.gpu, resource_settings) app = modal.App( f"zenml-{info.run_name}-{info.step_run_id}-{info.pipeline_step_name}" @@ -231,7 +161,7 @@ async def run_sandbox() -> asyncio.Future[None]: cloud=settings.cloud, region=settings.region, app=app, - timeout=86400, # 24h, the max Modal allows + timeout=settings.timeout, ) await 
sb.wait.aio() From 9a0f2a5269e771953b2a1f2cbf222ea2b0390da2 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 12:28:16 +0200 Subject: [PATCH 05/77] Add log streaming for async execution in Modal Orchestrator --- .../modal/orchestrators/modal_orchestrator.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 3368d70b592..40e59faeca5 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -52,6 +52,7 @@ get_resource_settings_from_deployment, get_resource_values, setup_modal_client, + stream_modal_logs_and_wait, ) from zenml.logger import get_logger from zenml.orchestrators import ContainerizedOrchestrator @@ -349,7 +350,7 @@ def prepare_or_run_pipeline( mode_suffix = execution_mode.replace("_", "-") app_name_base = f"zenml-{deployment.pipeline_configuration.name.replace('_', '-')}-{mode_suffix}" - execute_step = get_or_deploy_persistent_modal_app( + execute_step, full_app_name = get_or_deploy_persistent_modal_app( app_name_base=app_name_base, zenml_image=zenml_image, execution_func=execution_func, @@ -380,7 +381,7 @@ def prepare_or_run_pipeline( def execute_modal_function( func_args: Tuple[Any, ...], description: str ) -> Any: - """Execute Modal function with proper sync/async control. + """Execute Modal function with proper sync/async control and log streaming. Args: func_args: Arguments to pass to the Modal function. @@ -389,20 +390,17 @@ def execute_modal_function( Returns: Result of the Modal function execution. 
""" - logger.info(f"Starting {description}") + # Always use .spawn() to get a FunctionCall object for log streaming + function_call = execute_step.spawn(*func_args) if sync_execution: - logger.debug("Using .remote() for synchronous execution") - # .remote() waits for completion but doesn't stream logs - result = execute_step.remote(*func_args) - logger.info(f"{description} completed successfully") - return result - else: - logger.debug( - "Using .spawn() for asynchronous fire-and-forget execution" + logger.debug("Using synchronous execution with log streaming") + # Stream logs while waiting for completion using app name + return stream_modal_logs_and_wait( + function_call, description, full_app_name ) - # .spawn() for fire-and-forget (async) - function_call = execute_step.spawn(*func_args) + else: + logger.debug("Using asynchronous fire-and-forget execution") logger.info( f"{description} started asynchronously (not waiting for completion)" ) From 7cf8fd5adfa076287b262fde82efce981a697960 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 12:33:03 +0200 Subject: [PATCH 06/77] Add app warming window hours for container reuse --- .../integrations/modal/flavors/modal_orchestrator_flavor.py | 5 +++++ .../integrations/modal/orchestrators/modal_orchestrator.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 9d0ebc6a6e5..82e9a0f9f0e 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -40,6 +40,8 @@ class ModalOrchestratorSettings(BaseSettings): max_containers: Maximum concurrent containers (replaces concurrency_limit). execution_mode: Execution mode - "pipeline" (default, fastest) or "per_step" (granular control). synchronous: Wait for completion (True) or fire-and-forget (False). 
+ app_warming_window_hours: Hours for app name window to enable container reuse. + Apps are reused within this time window for efficiency. Default 2 hours. """ gpu: Optional[str] = None @@ -55,6 +57,9 @@ class ModalOrchestratorSettings(BaseSettings): synchronous: bool = ( True # Wait for completion (True) or fire-and-forget (False) ) + app_warming_window_hours: float = ( + 2.0 # Default 2-hour window for app reuse + ) class ModalOrchestratorConfig( diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 40e59faeca5..07b9a93a895 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -367,6 +367,8 @@ def prepare_or_run_pipeline( or self.config.max_containers, environment_name=settings.environment or self.config.environment, # Use environment from config/settings + app_warming_window_hours=settings.app_warming_window_hours + or self.config.app_warming_window_hours, ) logger.info("Executing with deployed Modal app and warm containers") From 60d227a95bb2f6b61fc031b292ce392ba0d4d4e2 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 13:41:36 +0200 Subject: [PATCH 07/77] Remove unnecessary exception handling in ModalOrchestrator and ModalStepOperator --- .../integrations/modal/orchestrators/modal_orchestrator.py | 4 ---- .../integrations/modal/step_operators/modal_step_operator.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 07b9a93a895..141421561a6 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -259,10 +259,6 @@ def _build_modal_image( Returns: The configured Modal image. 
- - Raises: - RuntimeError: If no Docker credentials are found. - ValueError: If no container registry is found. """ # Get the ZenML-built image that contains all pipeline code image_name = self.get_image(deployment=deployment) diff --git a/src/zenml/integrations/modal/step_operators/modal_step_operator.py b/src/zenml/integrations/modal/step_operators/modal_step_operator.py index 410655697ad..12307158be4 100644 --- a/src/zenml/integrations/modal/step_operators/modal_step_operator.py +++ b/src/zenml/integrations/modal/step_operators/modal_step_operator.py @@ -114,10 +114,6 @@ def launch( info: The step run information. entrypoint_command: The entrypoint command for the step. environment: The environment variables for the step. - - Raises: - RuntimeError: If no Docker credentials are found for the container registry. - ValueError: If no container registry is found in the stack. """ settings = cast(ModalStepOperatorSettings, self.get_settings(info)) image_name = info.get_image(key=MODAL_STEP_OPERATOR_DOCKER_IMAGE_KEY) From 45ed008b416f77a3981922a253e577dc1a15a071 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 13:54:14 +0200 Subject: [PATCH 08/77] Refactor return statement to assign and return validator --- .../integrations/modal/orchestrators/modal_orchestrator.py | 3 ++- .../integrations/modal/step_operators/modal_step_operator.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 141421561a6..457640438ab 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -209,7 +209,8 @@ def validator(self) -> Optional[StackValidator]: Returns: A `StackValidator` instance. 
""" - return create_modal_stack_validator() + validator: StackValidator = create_modal_stack_validator() + return validator def get_orchestrator_run_id(self) -> str: """Returns the active orchestrator run id. diff --git a/src/zenml/integrations/modal/step_operators/modal_step_operator.py b/src/zenml/integrations/modal/step_operators/modal_step_operator.py index 12307158be4..3ef448deded 100644 --- a/src/zenml/integrations/modal/step_operators/modal_step_operator.py +++ b/src/zenml/integrations/modal/step_operators/modal_step_operator.py @@ -77,7 +77,8 @@ def validator(self) -> Optional[StackValidator]: Returns: The stack validator. """ - return create_modal_stack_validator() + validator: StackValidator = create_modal_stack_validator() + return validator def get_docker_builds( self, deployment: "PipelineDeploymentBase" From 0b6b9f9102270d9fc59361aba48ccebd8760325d Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 14:20:07 +0200 Subject: [PATCH 09/77] Add Modal integration utils module for mypy discoverability --- src/zenml/integrations/modal/__init__.py | 4 + src/zenml/integrations/modal/utils.py | 483 +++++++++++++++++++++++ 2 files changed, 487 insertions(+) create mode 100644 src/zenml/integrations/modal/utils.py diff --git a/src/zenml/integrations/modal/__init__.py b/src/zenml/integrations/modal/__init__.py index 8feee188a15..c9a6597d53e 100644 --- a/src/zenml/integrations/modal/__init__.py +++ b/src/zenml/integrations/modal/__init__.py @@ -47,3 +47,7 @@ def flavors(cls) -> List[Type[Flavor]]: return [ModalOrchestratorFlavor, ModalStepOperatorFlavor] +# Import utils module to make it discoverable by mypy +from zenml.integrations.modal import utils # noqa: F401 + + diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py new file mode 100644 index 00000000000..bc2fb4721ca --- /dev/null +++ b/src/zenml/integrations/modal/utils.py @@ -0,0 +1,483 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Shared utilities for Modal integration components.""" + +import os +import subprocess +import threading +import time +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union + +try: + import modal +except ImportError: + modal = None # type: ignore + +from zenml.config import ResourceSettings +from zenml.config.resource_settings import ByteUnit +from zenml.enums import StackComponentType +from zenml.logger import get_logger +from zenml.stack import Stack, StackValidator + +if TYPE_CHECKING: + from pydantic import SecretStr + +logger = get_logger(__name__) + +# Common environment variable for Modal orchestrator run ID +ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID = "ZENML_MODAL_ORCHESTRATOR_RUN_ID" + + +def setup_modal_client( + token: Optional["SecretStr"] = None, + workspace: Optional[str] = None, + environment: Optional[str] = None, +) -> None: + """Setup Modal client with authentication. + + Args: + token: Modal API token for authentication. + workspace: Modal workspace name. + environment: Modal environment name. 
+ """ + if token: + # Set Modal token from config + os.environ["MODAL_TOKEN_ID"] = token.get_secret_value() + logger.info("Using Modal token from config") + else: + logger.info("Using default Modal authentication (~/.modal.toml)") + + # Set workspace/environment if provided + if workspace: + os.environ["MODAL_WORKSPACE"] = workspace + if environment: + os.environ["MODAL_ENVIRONMENT"] = environment + + +def get_gpu_values( + gpu_type: Optional[str], resource_settings: ResourceSettings +) -> Optional[str]: + """Get the GPU values for Modal components. + + Args: + gpu_type: The GPU type (e.g., "T4", "A100"). + resource_settings: The resource settings. + + Returns: + The GPU string if a count is specified, otherwise the GPU type. + """ + if not gpu_type: + return None + # Prefer resource_settings gpu_count, fallback to 1 + gpu_count = resource_settings.gpu_count or 1 + return f"{gpu_type}:{gpu_count}" if gpu_count > 1 else gpu_type + + +def get_resource_values( + resource_settings: ResourceSettings, +) -> Tuple[Optional[int], Optional[int]]: + """Get CPU and memory values from resource settings. + + Args: + resource_settings: The resource settings. + + Returns: + Tuple of (cpu_count, memory_mb). + """ + # Get CPU count + cpu_count: Optional[int] = None + if resource_settings.cpu_count is not None: + cpu_count = int(resource_settings.cpu_count) + + # Convert memory to MB if needed + memory_mb: Optional[int] = None + if resource_settings.memory: + memory_value = resource_settings.get_memory(ByteUnit.MB) + if memory_value is not None: + memory_mb = int(memory_value) + + return cpu_count, memory_mb + + +def build_modal_image( + image_name: str, + stack: "Stack", + environment: Dict[str, str], +) -> Any: + """Build a Modal image from a ZenML-built Docker image. + + Args: + image_name: The name of the Docker image to use as base. + stack: The ZenML stack containing container registry. + environment: Environment variables to set in the image. 
+ + Returns: + The configured Modal image. + + Raises: + RuntimeError: If no Docker credentials are found. + ValueError: If no container registry is found. + """ + if not stack.container_registry: + raise ValueError( + "No Container registry found in the stack. " + "Please add a container registry and ensure " + "it is correctly configured." + ) + + if docker_creds := stack.container_registry.credentials: + docker_username, docker_password = docker_creds + else: + raise RuntimeError( + "No Docker credentials found for the container registry." + ) + + # Create Modal secret for registry authentication + registry_secret = modal.Secret.from_dict( + { + "REGISTRY_USERNAME": docker_username, + "REGISTRY_PASSWORD": docker_password, + } + ) + + # Build Modal image from the ZenML-built image + # Use from_registry to pull the ZenML image with authentication + # and install Modal dependencies + zenml_image = ( + modal.Image.from_registry(image_name, secret=registry_secret) + .pip_install("modal") # Install Modal in the container + .env(environment) + ) + + return zenml_image + + +def create_modal_stack_validator() -> StackValidator: + """Create a stack validator for Modal components. + + Returns: + A StackValidator that ensures remote artifact store and container registry. + """ + + def _validate_remote_components(stack: "Stack") -> Tuple[bool, str]: + if stack.artifact_store.config.is_local: + return False, ( + "Modal components run code remotely and " + "need to write files into the artifact store, but the " + f"artifact store `{stack.artifact_store.name}` of the " + "active stack is local. Please ensure that your stack " + "contains a remote artifact store when using Modal " + "components." 
+ ) + + container_registry = stack.container_registry + assert container_registry is not None + + if container_registry.config.is_local: + return False, ( + "Modal components run code remotely and " + "need to push/pull Docker images, but the " + f"container registry `{container_registry.name}` of the " + "active stack is local. Please ensure that your stack " + "contains a remote container registry when using Modal " + "components." + ) + + return True, "" + + return StackValidator( + required_components={ + StackComponentType.CONTAINER_REGISTRY, + StackComponentType.IMAGE_BUILDER, + }, + custom_validation_function=_validate_remote_components, + ) + + +def get_or_deploy_persistent_modal_app( + app_name_base: str, + zenml_image: Any, + execution_func: Any, + function_name: str, + gpu_values: Optional[str] = None, + cpu_count: Optional[int] = None, + memory_mb: Optional[int] = None, + cloud: Optional[str] = None, + region: Optional[str] = None, + timeout: int = 86400, + min_containers: Optional[int] = None, + max_containers: Optional[int] = None, + environment_name: Optional[str] = None, + app_warming_window_hours: float = 2.0, +) -> Tuple[Any, str]: + """Get or deploy a persistent Modal app with warm containers. + + This function deploys a Modal app that stays alive with warm containers + for maximum speed between runs. + + Args: + app_name_base: Base name for the app (will be suffixed with timestamp). + zenml_image: Pre-built ZenML Docker image for Modal. + execution_func: The function to execute in the Modal app. + function_name: Name of the function in the app. + gpu_values: GPU configuration string. + cpu_count: Number of CPU cores. + memory_mb: Memory allocation in MB. + cloud: Cloud provider to use. + region: Region to deploy in. + timeout: Maximum execution timeout. + min_containers: Minimum containers to keep warm. + max_containers: Maximum containers to scale to. + environment_name: Modal environment name. 
+ app_warming_window_hours: Hours for app name window to enable reuse. + + Returns: + Tuple of (Modal function ready for execution, full app name). + + Raises: + Exception: If deployment fails. + """ + # Create timestamp window for app reuse (rounds down to nearest window boundary) + current_time = int(time.time()) + window_seconds = int( + app_warming_window_hours * 3600 + ) # Convert hours to seconds + time_window = current_time // window_seconds + + app_name = f"{app_name_base}-{time_window}" + + logger.info(f"Getting/deploying persistent Modal app: {app_name}") + + # Create the app + app = modal.App(app_name) + + # Ensure we have minimum containers for fast startup + effective_min_containers = min_containers or 1 + effective_max_containers = max_containers or 10 + + execute_step_func = app.function( + image=zenml_image, + gpu=gpu_values, + cpu=cpu_count, + memory=memory_mb, + cloud=cloud, + region=region, + timeout=timeout, + min_containers=effective_min_containers, # Keep containers warm for speed + max_containers=effective_max_containers, # Allow scaling + )(execution_func) + + # Try to lookup existing app in current time window, deploy if not found + try: + logger.debug( + f"Checking for Modal app in current time window: {app_name}" + ) + + try: + modal.App.lookup( + app_name, environment_name=environment_name or "main" + ) + logger.info( + f"Found existing app '{app_name}' with fresh tokens - reusing warm containers" + ) + + # Try to get the function directly + try: + existing_function = modal.Function.from_name( + app_name, + function_name, + environment_name=environment_name or "main", + ) + logger.debug( + "Successfully retrieved function from existing app" + ) + return existing_function, app_name + except Exception as func_error: + logger.warning( + f"Function lookup failed: {func_error}, redeploying" + ) + # Fall through to deployment + + except Exception: + # App not found or other lookup error - deploy fresh app + logger.debug( + "No app found for 
current time window, deploying fresh app" + ) + + # Deploy the app + app.deploy(name=app_name, environment_name=environment_name or "main") + logger.info( + f"App '{app_name}' deployed with {effective_min_containers} warm containers" + ) + logger.info( + f"View real-time logs at: https://modal.com/apps/{app_name}" + ) + + except Exception as e: + logger.error(f"Deployment failed: {e}") + raise + + logger.info( + f"Modal app configured with min_containers={effective_min_containers}, max_containers={effective_max_containers}" + ) + + return execute_step_func, app_name + + +def get_resource_settings_from_deployment( + deployment: Any, + resource_settings_key: str = "resources", +) -> ResourceSettings: + """Extract resource settings from pipeline deployment. + + Args: + deployment: The pipeline deployment. + resource_settings_key: Key to look for resource settings. + + Returns: + ResourceSettings object with the configuration. + """ + pipeline_resource_settings: Union[Dict[str, Any], Any] = ( + deployment.pipeline_configuration.settings.get( + resource_settings_key, {} + ) + ) + if pipeline_resource_settings: + # Convert to dict if it's a BaseSettings instance + if hasattr(pipeline_resource_settings, "model_dump"): + pipeline_resource_dict = pipeline_resource_settings.model_dump() + else: + pipeline_resource_dict = pipeline_resource_settings + resource_settings = ResourceSettings.model_validate( + pipeline_resource_dict + ) + else: + # Fallback to first step's resource settings if no pipeline-level resources + if deployment.step_configurations: + first_step = list(deployment.step_configurations.values())[0] + resource_settings = first_step.config.resource_settings + else: + resource_settings = ResourceSettings() # Default empty settings + + return resource_settings + + +def stream_modal_logs_and_wait( + function_call: Any, + description: str, + app_name: str, + check_interval: float = 2.0, +) -> Any: + """Stream logs from Modal app using CLI and wait for FunctionCall 
completion. + + Args: + function_call: The Modal FunctionCall object from .spawn() + description: Description of the operation for logging. + app_name: Name of the Modal app to stream logs from. + check_interval: How often to check for completion (seconds). + + Returns: + The result of the function execution. + + Raises: + Exception: If the Modal function execution fails. + KeyboardInterrupt: If the user cancels the execution. + """ + logger.info(f"Starting {description}") + + # Start log streaming in a separate thread + log_stream_active = threading.Event() + log_stream_active.set() + + def stream_logs() -> None: + """Stream logs from Modal CLI in a separate thread.""" + try: + # Use modal CLI to stream logs + cmd = ["modal", "app", "logs", app_name, "--timestamps"] + logger.debug(f"Starting log stream: {' '.join(cmd)}") + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, # Line buffered + universal_newlines=True, + ) + + # Stream logs line by line + while log_stream_active.is_set() and process.poll() is None: + if process.stdout: + line = process.stdout.readline() + if line: + # Clean up the log line and forward to our logger + log_msg = line.strip() + if log_msg and not log_msg.startswith("No logs"): + logger.info(f"[Modal] {log_msg}") + else: + break + + # Clean up process + if process.poll() is None: + process.terminate() + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + + except FileNotFoundError: + logger.warning( + "Modal CLI not found. 
Install with: pip install modal" + ) + except Exception as e: + logger.debug(f"Log streaming error: {e}") + + # Start log streaming thread + log_thread = threading.Thread(target=stream_logs, daemon=True) + log_thread.start() + + try: + # Poll for function completion + start_time = time.time() + while True: + try: + # Try to get result with timeout=0 (non-blocking) + result = function_call.get(timeout=0) + elapsed = time.time() - start_time + logger.info( + f"{description} completed successfully after {elapsed:.1f}s" + ) + return result + except TimeoutError: + # Function still running, continue waiting + time.sleep(check_interval) + except Exception as e: + # Function failed + elapsed = time.time() - start_time + logger.error(f"{description} failed after {elapsed:.1f}s: {e}") + raise + + except KeyboardInterrupt: + logger.info(f"Cancelling {description}") + try: + function_call.cancel() + logger.info("Function cancelled successfully") + except Exception as cancel_error: + logger.warning(f"Could not cancel function: {cancel_error}") + raise + finally: + # Stop log streaming + log_stream_active.clear() + # Give the log thread a moment to clean up + time.sleep(0.5) From d8318b148ac6b3ba149c9fb5b7c0053fbc8c0769 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 15:43:36 +0200 Subject: [PATCH 10/77] Update Modal integration to require Modal version 1 --- src/zenml/integrations/modal/__init__.py | 2 +- .../flavors/modal_orchestrator_flavor.py | 10 +++++-- .../flavors/modal_step_operator_flavor.py | 10 +++++-- .../modal/orchestrators/modal_orchestrator.py | 3 +- .../step_operators/modal_step_operator.py | 3 +- src/zenml/integrations/modal/utils.py | 30 +++++++++++++++---- 6 files changed, 43 insertions(+), 15 deletions(-) diff --git a/src/zenml/integrations/modal/__init__.py b/src/zenml/integrations/modal/__init__.py index c9a6597d53e..57703b049d4 100644 --- a/src/zenml/integrations/modal/__init__.py +++ b/src/zenml/integrations/modal/__init__.py @@ -30,7 
+30,7 @@ class ModalIntegration(Integration): """Definition of Modal integration for ZenML.""" NAME = MODAL - REQUIREMENTS = ["modal>=0.64.49,<1"] + REQUIREMENTS = ["modal>=1"] @classmethod def flavors(cls) -> List[Type[Flavor]]: diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 82e9a0f9f0e..6899b72fe7e 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -68,13 +68,17 @@ class ModalOrchestratorConfig( """Modal orchestrator config optimized for BLAZING FAST execution. Attributes: - token: Modal API token for authentication. If not provided, - falls back to Modal's default authentication (~/.modal.toml). + token_id: Modal API token ID (ak-xxxxx format) for authentication. + token_secret: Modal API token secret (as-xxxxx format) for authentication. workspace: Modal workspace name (optional). environment: Modal environment name (optional). + + Note: If token_id and token_secret are not provided, falls back to + Modal's default authentication (~/.modal.toml). """ - token: Optional[SecretStr] = None + token_id: Optional[SecretStr] = None + token_secret: Optional[SecretStr] = None workspace: Optional[str] = None environment: Optional[str] = None diff --git a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py index ead81a08199..ee932ac596a 100644 --- a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py @@ -59,13 +59,17 @@ class ModalStepOperatorConfig( """Configuration for the Modal step operator. Attributes: - token: Modal API token for authentication. If not provided, - falls back to Modal's default authentication (~/.modal.toml). + token_id: Modal API token ID (ak-xxxxx format) for authentication. 
+ token_secret: Modal API token secret (as-xxxxx format) for authentication. workspace: Modal workspace name (optional). environment: Modal environment name (optional). + + Note: If token_id and token_secret are not provided, falls back to + Modal's default authentication (~/.modal.toml). """ - token: Optional[SecretStr] = None + token_id: Optional[SecretStr] = None + token_secret: Optional[SecretStr] = None workspace: Optional[str] = None environment: Optional[str] = None diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 457640438ab..a50ea360ab0 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -197,7 +197,8 @@ def settings_class(self) -> Optional[Type["BaseSettings"]]: def _setup_modal_client(self) -> None: """Setup Modal client with authentication.""" setup_modal_client( - token=self.config.token, + token_id=self.config.token_id, + token_secret=self.config.token_secret, workspace=self.config.workspace, environment=self.config.environment, ) diff --git a/src/zenml/integrations/modal/step_operators/modal_step_operator.py b/src/zenml/integrations/modal/step_operators/modal_step_operator.py index 3ef448deded..aab59bfe409 100644 --- a/src/zenml/integrations/modal/step_operators/modal_step_operator.py +++ b/src/zenml/integrations/modal/step_operators/modal_step_operator.py @@ -123,7 +123,8 @@ def launch( # Setup Modal authentication setup_modal_client( - token=self.config.token, + token_id=self.config.token_id, + token_secret=self.config.token_secret, workspace=self.config.workspace, environment=self.config.environment, ) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index bc2fb4721ca..b5ab94368e3 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -40,21 +40,39 @@ def 
setup_modal_client( - token: Optional["SecretStr"] = None, + token_id: Optional["SecretStr"] = None, + token_secret: Optional["SecretStr"] = None, workspace: Optional[str] = None, environment: Optional[str] = None, ) -> None: """Setup Modal client with authentication. Args: - token: Modal API token for authentication. + token_id: Modal API token ID (ak-xxxxx format). + token_secret: Modal API token secret (as-xxxxx format). workspace: Modal workspace name. environment: Modal environment name. """ - if token: - # Set Modal token from config - os.environ["MODAL_TOKEN_ID"] = token.get_secret_value() - logger.info("Using Modal token from config") + if token_id and token_secret: + # Set both token ID and secret + os.environ["MODAL_TOKEN_ID"] = token_id.get_secret_value() + os.environ["MODAL_TOKEN_SECRET"] = token_secret.get_secret_value() + logger.info("Using Modal token ID and secret from config") + elif token_id: + # Only token ID provided + os.environ["MODAL_TOKEN_ID"] = token_id.get_secret_value() + logger.info("Using Modal token ID from config") + logger.warning( + "Only token ID provided. Make sure MODAL_TOKEN_SECRET is set " + "or Modal authentication may fail." + ) + elif token_secret: + # Only token secret provided (unusual) + os.environ["MODAL_TOKEN_SECRET"] = token_secret.get_secret_value() + logger.warning( + "Only token secret provided. Make sure MODAL_TOKEN_ID is set " + "or Modal authentication may fail." 
+ ) else: logger.info("Using default Modal authentication (~/.modal.toml)") From b7400b6aa21d4cc98dc645695787da73707614b5 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 16:19:48 +0200 Subject: [PATCH 11/77] Introduce ModalExecutionMode enum for execution modes --- .../flavors/modal_orchestrator_flavor.py | 19 +++++++++++++++++-- .../modal/orchestrators/modal_orchestrator.py | 12 +++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 6899b72fe7e..8dba59c5458 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -13,6 +13,7 @@ # permissions and limitations under the License. """Modal orchestrator flavor.""" +from enum import Enum from typing import TYPE_CHECKING, Optional, Type from pydantic import SecretStr @@ -26,6 +27,18 @@ MODAL_ORCHESTRATOR_FLAVOR = "modal" +class ModalExecutionMode(str, Enum): + """Execution modes for the Modal orchestrator. + + Attributes: + PIPELINE: Execute entire pipeline in one Modal function (fastest, default). + PER_STEP: Execute each step in a separate Modal function (granular control). + """ + + PIPELINE = "pipeline" + PER_STEP = "per_step" + + class ModalOrchestratorSettings(BaseSettings): """Modal orchestrator settings. @@ -38,7 +51,7 @@ class ModalOrchestratorSettings(BaseSettings): timeout: Maximum execution time in seconds (default 24h). min_containers: Minimum containers to keep warm (replaces keep_warm). max_containers: Maximum concurrent containers (replaces concurrency_limit). - execution_mode: Execution mode - "pipeline" (default, fastest) or "per_step" (granular control). + execution_mode: Execution mode - PIPELINE (default, fastest) or PER_STEP (granular control). synchronous: Wait for completion (True) or fire-and-forget (False). 
app_warming_window_hours: Hours for app name window to enable container reuse. Apps are reused within this time window for efficiency. Default 2 hours. @@ -53,7 +66,9 @@ class ModalOrchestratorSettings(BaseSettings): 1 # Keep 1 container warm for sequential execution ) max_containers: Optional[int] = 10 # Allow up to 10 concurrent containers - execution_mode: str = "pipeline" # Default to fastest mode + execution_mode: ModalExecutionMode = ( + ModalExecutionMode.PIPELINE + ) # Default to fastest mode synchronous: bool = ( True # Wait for completion (True) or fire-and-forget (False) ) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index a50ea360ab0..8fdd3c029ad 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -64,6 +64,12 @@ ModalOrchestratorConfig, ModalOrchestratorSettings, ) + +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalExecutionMode, +) + +if TYPE_CHECKING: from zenml.models import PipelineDeploymentResponse, PipelineRunResponse from zenml.models.v2.core.pipeline_deployment import PipelineDeploymentBase @@ -335,7 +341,7 @@ def prepare_or_run_pipeline( # Create the execution function based on execution mode execution_mode = settings.execution_mode or self.config.execution_mode - if execution_mode == "per_step": + if execution_mode == ModalExecutionMode.PER_STEP: logger.debug("Creating per-step mode for granular execution") execution_func: Any = run_step_in_modal function_name = "run_step_in_modal" @@ -345,7 +351,7 @@ def prepare_or_run_pipeline( function_name = "run_entire_pipeline" # Get or deploy persistent Modal app with warm containers - mode_suffix = execution_mode.replace("_", "-") + mode_suffix = execution_mode.value.replace("_", "-") app_name_base = f"zenml-{deployment.pipeline_configuration.name.replace('_', 
'-')}-{mode_suffix}" execute_step, full_app_name = get_or_deploy_persistent_modal_app( @@ -406,7 +412,7 @@ def execute_modal_function( ) return function_call - if execution_mode == "per_step": + if execution_mode == ModalExecutionMode.PER_STEP: logger.info("Using per-step mode for granular execution") # Execute steps individually for step_name in step_names: From ee15117da6403ffcdc40b94acca0f9467caf9d0e Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 17:25:34 +0200 Subject: [PATCH 12/77] Refactor Modal authentication setup and deployment --- .../flavors/modal_orchestrator_flavor.py | 13 +- .../flavors/modal_step_operator_flavor.py | 7 +- .../modal/orchestrators/modal_orchestrator.py | 1 + src/zenml/integrations/modal/utils.py | 200 +++++++++++++++--- 4 files changed, 182 insertions(+), 39 deletions(-) diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 8dba59c5458..f64b368f2fc 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -16,10 +16,9 @@ from enum import Enum from typing import TYPE_CHECKING, Optional, Type -from pydantic import SecretStr - from zenml.config.base_settings import BaseSettings from zenml.orchestrators import BaseOrchestratorConfig, BaseOrchestratorFlavor +from zenml.utils.secret_utils import SecretField if TYPE_CHECKING: from zenml.integrations.modal.orchestrators import ModalOrchestrator @@ -32,7 +31,8 @@ class ModalExecutionMode(str, Enum): Attributes: PIPELINE: Execute entire pipeline in one Modal function (fastest, default). - PER_STEP: Execute each step in a separate Modal function (granular control). + PER_STEP: Execute each step in a separate Modal function (granular + control). """ PIPELINE = "pipeline" @@ -43,7 +43,8 @@ class ModalOrchestratorSettings(BaseSettings): """Modal orchestrator settings. 
Attributes: - gpu: The type of GPU to use for the pipeline execution (e.g., "T4", "A100"). + gpu: The type of GPU to use for the pipeline execution (e.g., "T4", + "A100"). Use ResourceSettings.gpu_count to specify the number of GPUs. region: The region to use for the pipeline execution. cloud: The cloud provider to use for the pipeline execution. @@ -92,8 +93,8 @@ class ModalOrchestratorConfig( Modal's default authentication (~/.modal.toml). """ - token_id: Optional[SecretStr] = None - token_secret: Optional[SecretStr] = None + token_id: Optional[str] = SecretField(default=None) + token_secret: Optional[str] = SecretField(default=None) workspace: Optional[str] = None environment: Optional[str] = None diff --git a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py index ee932ac596a..89d103233f3 100644 --- a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py @@ -15,11 +15,10 @@ from typing import TYPE_CHECKING, Optional, Type -from pydantic import SecretStr - from zenml.config.base_settings import BaseSettings from zenml.integrations.modal import MODAL_STEP_OPERATOR_FLAVOR from zenml.step_operators import BaseStepOperatorConfig, BaseStepOperatorFlavor +from zenml.utils.secret_utils import SecretField if TYPE_CHECKING: from zenml.integrations.modal.step_operators import ModalStepOperator @@ -68,8 +67,8 @@ class ModalStepOperatorConfig( Modal's default authentication (~/.modal.toml). 
""" - token_id: Optional[SecretStr] = None - token_secret: Optional[SecretStr] = None + token_id: Optional[str] = SecretField(default=None) + token_secret: Optional[str] = SecretField(default=None) workspace: Optional[str] = None environment: Optional[str] = None diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 8fdd3c029ad..52fa03b53d7 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -359,6 +359,7 @@ def prepare_or_run_pipeline( zenml_image=zenml_image, execution_func=execution_func, function_name=function_name, + deployment=deployment, gpu_values=gpu_values, cpu_count=cpu_count, # Use ResourceSettings value or None (Modal default) memory_mb=memory_mb, # Use ResourceSettings value or None (Modal default) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index b5ab94368e3..033db1c63bf 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -17,7 +17,7 @@ import subprocess import threading import time -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union try: import modal @@ -30,18 +30,37 @@ from zenml.logger import get_logger from zenml.stack import Stack, StackValidator -if TYPE_CHECKING: - from pydantic import SecretStr - logger = get_logger(__name__) # Common environment variable for Modal orchestrator run ID ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID = "ZENML_MODAL_ORCHESTRATOR_RUN_ID" +class ModalAuthenticationError(Exception): + """Exception raised for Modal authentication issues with helpful guidance.""" + + def __init__(self, message: str, suggestions: Optional[List[str]] = None): + """Initialize the authentication error with message and optional suggestions. + + Args: + message: The error message. 
+ suggestions: Optional list of suggestions for fixing the issue. + """ + super().__init__(message) + self.suggestions = suggestions or [] + + def __str__(self) -> str: + """Return formatted error message with suggestions.""" + base_message = super().__str__() + if self.suggestions: + suggestions_text = "\n".join(f" • {s}" for s in self.suggestions) + return f"{base_message}\n\nSuggestions:\n{suggestions_text}" + return base_message + + def setup_modal_client( - token_id: Optional["SecretStr"] = None, - token_secret: Optional["SecretStr"] = None, + token_id: Optional[str] = None, + token_secret: Optional[str] = None, workspace: Optional[str] = None, environment: Optional[str] = None, ) -> None: @@ -54,27 +73,70 @@ def setup_modal_client( environment: Modal environment name. """ if token_id and token_secret: + # Validate token format + if not token_id.startswith("ak-"): + logger.warning( + f"Token ID format may be invalid. Expected format: ak-xxxxx, " + f"got: {token_id[:10]}... (truncated for security)" + ) + + if not token_secret.startswith("as-"): + logger.warning( + f"Token secret format may be invalid. Expected format: as-xxxxx, " + f"got: {token_secret[:10]}... (truncated for security)" + ) + # Set both token ID and secret - os.environ["MODAL_TOKEN_ID"] = token_id.get_secret_value() - os.environ["MODAL_TOKEN_SECRET"] = token_secret.get_secret_value() + os.environ["MODAL_TOKEN_ID"] = token_id + os.environ["MODAL_TOKEN_SECRET"] = token_secret logger.info("Using Modal token ID and secret from config") + logger.debug(f"Token ID starts with: {token_id[:5]}...") + logger.debug(f"Token secret starts with: {token_secret[:5]}...") + elif token_id: + # Validate token format + if not token_id.startswith("ak-"): + logger.warning( + f"Token ID format may be invalid. Expected format: ak-xxxxx, " + f"got: {token_id[:10]}... 
(truncated for security)" + ) + # Only token ID provided - os.environ["MODAL_TOKEN_ID"] = token_id.get_secret_value() + os.environ["MODAL_TOKEN_ID"] = token_id logger.info("Using Modal token ID from config") logger.warning( "Only token ID provided. Make sure MODAL_TOKEN_SECRET is set " "or Modal authentication may fail." ) + logger.debug(f"Token ID starts with: {token_id[:5]}...") + elif token_secret: + # Validate token format + if not token_secret.startswith("as-"): + logger.warning( + f"Token secret format may be invalid. Expected format: as-xxxxx, " + f"got: {token_secret[:10]}... (truncated for security)" + ) + # Only token secret provided (unusual) - os.environ["MODAL_TOKEN_SECRET"] = token_secret.get_secret_value() + os.environ["MODAL_TOKEN_SECRET"] = token_secret logger.warning( "Only token secret provided. Make sure MODAL_TOKEN_ID is set " "or Modal authentication may fail." ) + logger.debug(f"Token secret starts with: {token_secret[:5]}...") + else: logger.info("Using default Modal authentication (~/.modal.toml)") + # Check if default auth exists + modal_toml_path = os.path.expanduser("~/.modal.toml") + if os.path.exists(modal_toml_path): + logger.debug(f"Found Modal config at {modal_toml_path}") + else: + logger.warning( + f"No Modal config found at {modal_toml_path}. " + "Run 'modal token new' to set up authentication." + ) # Set workspace/environment if provided if workspace: @@ -228,6 +290,7 @@ def get_or_deploy_persistent_modal_app( zenml_image: Any, execution_func: Any, function_name: str, + deployment: Any, gpu_values: Optional[str] = None, cpu_count: Optional[int] = None, memory_mb: Optional[int] = None, @@ -242,13 +305,15 @@ def get_or_deploy_persistent_modal_app( """Get or deploy a persistent Modal app with warm containers. This function deploys a Modal app that stays alive with warm containers - for maximum speed between runs. + for maximum speed between runs. 
The app name includes both time window + and build checksum to ensure fresh deployments only when builds actually change. Args: - app_name_base: Base name for the app (will be suffixed with timestamp). + app_name_base: Base name for the app (will be suffixed with timestamp and build hash). zenml_image: Pre-built ZenML Docker image for Modal. execution_func: The function to execute in the Modal app. function_name: Name of the function in the app. + deployment: The pipeline deployment containing build information. gpu_values: GPU configuration string. cpu_count: Number of CPU cores. memory_mb: Memory allocation in MB. @@ -273,9 +338,25 @@ def get_or_deploy_persistent_modal_app( ) # Convert hours to seconds time_window = current_time // window_seconds - app_name = f"{app_name_base}-{time_window}" + # Generate build identifier to ensure fresh deployments only when builds actually change + # Use deployment build checksum which only changes when Docker settings, requirements, etc. change + build_hash = "no-build" + if deployment.build and deployment.build.checksum: + # Use first 8 characters of build checksum for compact identifier + build_hash = deployment.build.checksum[:8] + logger.debug(f"Using build checksum: {deployment.build.checksum}") + else: + logger.warning( + "No build checksum available, using fallback identifier" + ) + + # Include both time window and build hash in app name + app_name = f"{app_name_base}-{time_window}-{build_hash}" logger.info(f"Getting/deploying persistent Modal app: {app_name}") + logger.debug( + f"App name includes time window: {time_window}, build hash: {build_hash}" + ) # Create the app app = modal.App(app_name) @@ -296,10 +377,10 @@ def get_or_deploy_persistent_modal_app( max_containers=effective_max_containers, # Allow scaling )(execution_func) - # Try to lookup existing app in current time window, deploy if not found + # Try to lookup existing app with matching time window and image, deploy if not found try: logger.debug( - 
f"Checking for Modal app in current time window: {app_name}" + f"Checking for Modal app with time window {time_window} and build hash {build_hash}: {app_name}" ) try: @@ -307,7 +388,7 @@ def get_or_deploy_persistent_modal_app( app_name, environment_name=environment_name or "main" ) logger.info( - f"Found existing app '{app_name}' with fresh tokens - reusing warm containers" + f"Found existing app '{app_name}' with matching build and fresh time window - reusing warm containers" ) # Try to get the function directly @@ -330,17 +411,38 @@ def get_or_deploy_persistent_modal_app( except Exception: # App not found or other lookup error - deploy fresh app logger.debug( - "No app found for current time window, deploying fresh app" + "No app found for current time window and build hash, deploying fresh app" ) - # Deploy the app - app.deploy(name=app_name, environment_name=environment_name or "main") - logger.info( - f"App '{app_name}' deployed with {effective_min_containers} warm containers" - ) - logger.info( - f"View real-time logs at: https://modal.com/apps/{app_name}" - ) + # Deploy the app with better error handling + try: + app.deploy( + name=app_name, environment_name=environment_name or "main" + ) + logger.info( + f"App '{app_name}' deployed with {effective_min_containers} warm containers" + ) + logger.info( + f"View real-time logs at: https://modal.com/apps/{app_name}" + ) + except Exception as deploy_error: + error_message = str(deploy_error) + if ( + "Token ID is malformed" in error_message + or "UNAUTHENTICATED" in error_message + ): + raise ModalAuthenticationError( + "Modal authentication failed. 
Token ID or secret is invalid.", + suggestions=[ + "Check that token_id starts with 'ak-' and token_secret starts with 'as-'", + "Get new tokens from Modal dashboard: https://modal.com/tokens", + "Or run 'modal token new' to set up ~/.modal.toml authentication", + "Ensure both token_id AND token_secret are provided in orchestrator config", + ], + ) from deploy_error + else: + # Re-raise other deployment errors as-is + raise except Exception as e: logger.error(f"Deployment failed: {e}") @@ -414,15 +516,35 @@ def stream_modal_logs_and_wait( """ logger.info(f"Starting {description}") + # Get function call ID for filtering (if available) + call_id = None + try: + call_id = getattr(function_call, "object_id", None) + if call_id: + logger.debug(f"Function call ID: {call_id}") + except Exception: + pass + + # Wait a moment for the function to start before beginning log streaming + # This helps avoid capturing old logs from previous runs + time.sleep(1) + # Start log streaming in a separate thread log_stream_active = threading.Event() log_stream_active.set() + start_time = time.time() def stream_logs() -> None: """Stream logs from Modal CLI in a separate thread.""" try: - # Use modal CLI to stream logs - cmd = ["modal", "app", "logs", app_name, "--timestamps"] + # Use modal CLI to stream logs (automatically streams while app is active) + cmd = [ + "modal", + "app", + "logs", + app_name, + "--timestamps", + ] logger.debug(f"Starting log stream: {' '.join(cmd)}") process = subprocess.Popen( @@ -441,8 +563,28 @@ def stream_logs() -> None: if line: # Clean up the log line and forward to our logger log_msg = line.strip() - if log_msg and not log_msg.startswith("No logs"): - logger.info(f"[Modal] {log_msg}") + + # Skip empty lines and "No logs" messages + if not log_msg or log_msg.startswith("No logs"): + continue + + # Try to filter out old logs by checking if they contain our call ID + # or if they occurred after we started the function + current_time = time.time() + log_age = 
current_time - start_time + + # Only show logs that are likely from the current execution + # Skip logs that seem to be from much earlier runs + if call_id and call_id in log_msg: + # This log definitely belongs to our execution + logger.info(f"{log_msg}") + elif ( + log_age < 300 + ): # Only show logs from last 5 minutes + # This log is recent enough to likely be ours + logger.info(f"{log_msg}") + # Else: skip old logs + else: break From 85aeef47f873ed1932936077d38f561e98a6c5ae Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 12 Jun 2025 17:32:42 +0200 Subject: [PATCH 13/77] Update platform references in log messages --- .../modal/orchestrators/modal_orchestrator.py | 16 ++++---- src/zenml/integrations/modal/utils.py | 41 +++++++++---------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 52fa03b53d7..51a69977a9d 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -91,7 +91,7 @@ def run_step_in_modal( Raises: Exception: If step execution fails. """ - logger.info(f"Running step '{step_name}' in Modal") + logger.info(f"Running step '{step_name}' remotely") sys.stdout.flush() # Set the orchestrator run ID in the Modal environment @@ -296,11 +296,11 @@ def prepare_or_run_pipeline( _ = placeholder_run # Mark as intentionally unused if modal is None: raise RuntimeError( - "Modal is not installed. Please install it with: pip install modal" + "Required dependencies not installed. Please install with: pip install modal" ) if deployment.schedule: logger.warning( - "Modal Orchestrator currently does not support the " + "Serverless Orchestrator currently does not support the " "use of schedules. The `schedule` will be ignored " "and the pipeline will be run immediately." 
) @@ -333,7 +333,7 @@ def prepare_or_run_pipeline( # Execute steps using Modal's fast container spin-up with persistent app logger.info( - "Starting pipeline execution with persistent Modal functions" + "Starting pipeline execution with persistent serverless functions" ) step_names = list(deployment.step_configurations.keys()) @@ -376,7 +376,9 @@ def prepare_or_run_pipeline( or self.config.app_warming_window_hours, ) - logger.info("Executing with deployed Modal app and warm containers") + logger.info( + "Executing with deployed serverless application and warm containers" + ) # Execute based on execution mode with improved Modal Function API usage sync_execution = ( @@ -424,7 +426,7 @@ def execute_modal_function( ) except Exception as e: logger.error(f"Step '{step_name}' failed: {e}") - logger.info("Check Modal dashboard for detailed logs") + logger.info("Check platform dashboard for detailed logs") raise else: # Default: execute entire pipeline in one function @@ -435,7 +437,7 @@ def execute_modal_function( ) except Exception as e: logger.error(f"Pipeline failed: {e}") - logger.info("Check Modal dashboard for detailed logs") + logger.info("Check platform dashboard for detailed logs") raise run_duration = time.time() - start_time diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 033db1c63bf..b513a620419 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -89,7 +89,7 @@ def setup_modal_client( # Set both token ID and secret os.environ["MODAL_TOKEN_ID"] = token_id os.environ["MODAL_TOKEN_SECRET"] = token_secret - logger.info("Using Modal token ID and secret from config") + logger.info("Using platform token ID and secret from config") logger.debug(f"Token ID starts with: {token_id[:5]}...") logger.debug(f"Token secret starts with: {token_secret[:5]}...") @@ -103,10 +103,10 @@ def setup_modal_client( # Only token ID provided os.environ["MODAL_TOKEN_ID"] = token_id - 
logger.info("Using Modal token ID from config") + logger.info("Using platform token ID from config") logger.warning( "Only token ID provided. Make sure MODAL_TOKEN_SECRET is set " - "or Modal authentication may fail." + "or platform authentication may fail." ) logger.debug(f"Token ID starts with: {token_id[:5]}...") @@ -122,19 +122,19 @@ def setup_modal_client( os.environ["MODAL_TOKEN_SECRET"] = token_secret logger.warning( "Only token secret provided. Make sure MODAL_TOKEN_ID is set " - "or Modal authentication may fail." + "or platform authentication may fail." ) logger.debug(f"Token secret starts with: {token_secret[:5]}...") else: - logger.info("Using default Modal authentication (~/.modal.toml)") + logger.info("Using default platform authentication (~/.modal.toml)") # Check if default auth exists modal_toml_path = os.path.expanduser("~/.modal.toml") if os.path.exists(modal_toml_path): - logger.debug(f"Found Modal config at {modal_toml_path}") + logger.debug(f"Found platform config at {modal_toml_path}") else: logger.warning( - f"No Modal config found at {modal_toml_path}. " + f"No platform config found at {modal_toml_path}. " "Run 'modal token new' to set up authentication." ) @@ -253,11 +253,11 @@ def create_modal_stack_validator() -> StackValidator: def _validate_remote_components(stack: "Stack") -> Tuple[bool, str]: if stack.artifact_store.config.is_local: return False, ( - "Modal components run code remotely and " + "Serverless components run code remotely and " "need to write files into the artifact store, but the " f"artifact store `{stack.artifact_store.name}` of the " "active stack is local. Please ensure that your stack " - "contains a remote artifact store when using Modal " + "contains a remote artifact store when using serverless " "components." 
) @@ -266,11 +266,11 @@ def _validate_remote_components(stack: "Stack") -> Tuple[bool, str]: if container_registry.config.is_local: return False, ( - "Modal components run code remotely and " + "Serverless components run code remotely and " "need to push/pull Docker images, but the " f"container registry `{container_registry.name}` of the " "active stack is local. Please ensure that your stack " - "contains a remote container registry when using Modal " + "contains a remote container registry when using serverless " "components." ) @@ -353,7 +353,9 @@ def get_or_deploy_persistent_modal_app( # Include both time window and build hash in app name app_name = f"{app_name_base}-{time_window}-{build_hash}" - logger.info(f"Getting/deploying persistent Modal app: {app_name}") + logger.info( + f"Getting/deploying persistent serverless application: {app_name}" + ) logger.debug( f"App name includes time window: {time_window}, build hash: {build_hash}" ) @@ -380,7 +382,7 @@ def get_or_deploy_persistent_modal_app( # Try to lookup existing app with matching time window and image, deploy if not found try: logger.debug( - f"Checking for Modal app with time window {time_window} and build hash {build_hash}: {app_name}" + f"Checking for serverless application with time window {time_window} and build hash {build_hash}: {app_name}" ) try: @@ -422,9 +424,6 @@ def get_or_deploy_persistent_modal_app( logger.info( f"App '{app_name}' deployed with {effective_min_containers} warm containers" ) - logger.info( - f"View real-time logs at: https://modal.com/apps/{app_name}" - ) except Exception as deploy_error: error_message = str(deploy_error) if ( @@ -432,10 +431,10 @@ def get_or_deploy_persistent_modal_app( or "UNAUTHENTICATED" in error_message ): raise ModalAuthenticationError( - "Modal authentication failed. Token ID or secret is invalid.", + "Platform authentication failed. 
Token ID or secret is invalid.", suggestions=[ "Check that token_id starts with 'ak-' and token_secret starts with 'as-'", - "Get new tokens from Modal dashboard: https://modal.com/tokens", + "Get new tokens from the platform dashboard", "Or run 'modal token new' to set up ~/.modal.toml authentication", "Ensure both token_id AND token_secret are provided in orchestrator config", ], @@ -449,7 +448,7 @@ def get_or_deploy_persistent_modal_app( raise logger.info( - f"Modal app configured with min_containers={effective_min_containers}, max_containers={effective_max_containers}" + f"Serverless application configured with min_containers={effective_min_containers}, max_containers={effective_max_containers}" ) return execute_step_func, app_name @@ -535,7 +534,7 @@ def stream_modal_logs_and_wait( start_time = time.time() def stream_logs() -> None: - """Stream logs from Modal CLI in a separate thread.""" + """Stream logs from platform CLI in a separate thread.""" try: # Use modal CLI to stream logs (automatically streams while app is active) cmd = [ @@ -598,7 +597,7 @@ def stream_logs() -> None: except FileNotFoundError: logger.warning( - "Modal CLI not found. Install with: pip install modal" + "Platform CLI not found. 
Install with: pip install modal" ) except Exception as e: logger.debug(f"Log streaming error: {e}") From 01f7ef29878eddee3dc4cbcc309a8d9c55b75cf8 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 13 Jun 2025 07:58:06 +0200 Subject: [PATCH 14/77] Include pipeline run ID for isolation and conflict prevention --- .../modal/orchestrators/modal_orchestrator.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 51a69977a9d..8cd358c624c 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -91,7 +91,12 @@ def run_step_in_modal( Raises: Exception: If step execution fails. """ - logger.info(f"Running step '{step_name}' remotely") + # Get pipeline run ID for debugging/logging + pipeline_run_id = os.environ.get("ZENML_PIPELINE_RUN_ID", "unknown") + + logger.info( + f"Running step '{step_name}' remotely (pipeline run: {pipeline_run_id})" + ) sys.stdout.flush() # Set the orchestrator run ID in the Modal environment @@ -136,11 +141,15 @@ def run_entire_pipeline( Raises: Exception: If pipeline execution fails. 
""" + # Get pipeline run ID for debugging/logging + pipeline_run_id = os.environ.get("ZENML_PIPELINE_RUN_ID", "unknown") + logger.info( "Starting entire pipeline using PipelineEntrypointConfiguration", extra={ "deployment_id": deployment_id, "orchestrator_run_id": orchestrator_run_id, + "pipeline_run_id": pipeline_run_id, }, ) @@ -308,9 +317,18 @@ def prepare_or_run_pipeline( # Setup Modal authentication self._setup_modal_client() - # Generate orchestrator run ID + # Generate orchestrator run ID and include pipeline run ID for isolation orchestrator_run_id = str(uuid4()) + + # Include pipeline run ID to prevent conflicts when same container + # handles multiple pipeline runs rapidly + pipeline_run_id = placeholder_run.id if placeholder_run else "unknown" + environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id + environment["ZENML_PIPELINE_RUN_ID"] = str(pipeline_run_id) + + logger.debug(f"Pipeline run ID: {pipeline_run_id}") + logger.debug(f"Orchestrator run ID: {orchestrator_run_id}") # Get settings from pipeline configuration (applies to entire pipeline) settings = cast( From 481972b413c9c9401b3af8786c44ead658304801 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 13 Jun 2025 15:42:09 +0200 Subject: [PATCH 15/77] Refactor Modal log streaming and improve resource selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract nested log streaming function into ModalLogStreamer class for better code organization - Remove unreliable timezone-based log filtering that could miss logs due to clock skew - Implement smarter resource fallback: use highest requirements across all steps instead of potentially unrepresentative first step - Add logging for resource selection decisions to improve debugging - Fix function-in-function code smell identified in PR review 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- src/zenml/integrations/modal/utils.py | 214 
++++++++++++++++---------- 1 file changed, 131 insertions(+), 83 deletions(-) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index b513a620419..38101692886 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -36,6 +36,102 @@ ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID = "ZENML_MODAL_ORCHESTRATOR_RUN_ID" +class ModalLogStreamer: + """Stream logs from Modal CLI in a separate thread.""" + + def __init__(self, app_name: str, call_id: Optional[str], logger: Any): + """Initialize the log streamer. + + Args: + app_name: Name of the Modal app to stream logs from. + call_id: Optional function call ID for filtering logs. + logger: Logger instance to use for output. + """ + self.app_name = app_name + self.call_id = call_id + self.logger = logger + self.log_stream_active = threading.Event() + self.log_thread: Optional[threading.Thread] = None + + def start(self) -> None: + """Start log streaming in a background thread.""" + self.log_stream_active.set() + self.log_thread = threading.Thread( + target=self._stream_logs, daemon=True + ) + self.log_thread.start() + + def stop(self) -> None: + """Stop log streaming.""" + self.log_stream_active.clear() + if self.log_thread: + # Give the log thread a moment to clean up + time.sleep(0.5) + + def _stream_logs(self) -> None: + """Stream logs from Modal CLI.""" + try: + # Use modal CLI to stream logs (automatically streams while app is active) + cmd = [ + "modal", + "app", + "logs", + self.app_name, + "--timestamps", + ] + self.logger.debug(f"Starting log stream: {' '.join(cmd)}") + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, # Line buffered + universal_newlines=True, + ) + + # Stream logs line by line + while self.log_stream_active.is_set() and process.poll() is None: + if process.stdout: + line = process.stdout.readline() + if line: + # Clean up the log line and forward to our logger 
+ log_msg = line.strip() + + # Skip empty lines and "No logs" messages + if not log_msg or log_msg.startswith("No logs"): + continue + + # Filter logs based on function call ID when available + # If no call ID, show all logs (Modal CLI handles recency) + if self.call_id and self.call_id in log_msg: + # This log definitely belongs to our execution + self.logger.info(f"{log_msg}") + elif not self.call_id: + # No call ID available, show all logs from Modal CLI stream + # Modal CLI already filters for recent/relevant logs + self.logger.info(f"{log_msg}") + # Else: skip logs that don't match our call ID + + else: + break + + # Clean up process + if process.poll() is None: + process.terminate() + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + + except FileNotFoundError: + self.logger.warning( + "Modal CLI not found. Install with: pip install modal" + ) + except Exception as e: + self.logger.debug(f"Log streaming error: {e}") + + class ModalAuthenticationError(Exception): """Exception raised for Modal authentication issues with helpful guidance.""" @@ -482,10 +578,38 @@ def get_resource_settings_from_deployment( pipeline_resource_dict ) else: - # Fallback to first step's resource settings if no pipeline-level resources + # Fallback to highest resource requirements across all steps if deployment.step_configurations: - first_step = list(deployment.step_configurations.values())[0] - resource_settings = first_step.config.resource_settings + # Find step with highest resource requirements for modal execution + max_cpu = 0 + max_memory = 0 + max_gpu = 0 + best_step_resources = ResourceSettings() + + for step_config in deployment.step_configurations.values(): + step_resources = step_config.config.resource_settings + step_cpu = step_resources.cpu_count or 0 + step_memory = step_resources.get_memory() or 0 + step_gpu = step_resources.gpu_count or 0 + + # Calculate resource "score" to find most demanding step + resource_score = ( + step_cpu + 
(step_memory / 1024) + (step_gpu * 10) + ) + best_score = max_cpu + (max_memory / 1024) + (max_gpu * 10) + + if resource_score > best_score: + max_cpu = step_cpu + max_memory = step_memory + max_gpu = step_gpu + best_step_resources = step_resources + + logger.info( + f"No pipeline-level resource settings found. Using highest resource " + f"requirements from steps: {max_cpu} CPUs, {max_memory / 1024:.1f}GB memory, " + f"{max_gpu} GPUs" + ) + resource_settings = best_step_resources else: resource_settings = ResourceSettings() # Default empty settings @@ -528,83 +652,9 @@ def stream_modal_logs_and_wait( # This helps avoid capturing old logs from previous runs time.sleep(1) - # Start log streaming in a separate thread - log_stream_active = threading.Event() - log_stream_active.set() - start_time = time.time() - - def stream_logs() -> None: - """Stream logs from platform CLI in a separate thread.""" - try: - # Use modal CLI to stream logs (automatically streams while app is active) - cmd = [ - "modal", - "app", - "logs", - app_name, - "--timestamps", - ] - logger.debug(f"Starting log stream: {' '.join(cmd)}") - - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, # Line buffered - universal_newlines=True, - ) - - # Stream logs line by line - while log_stream_active.is_set() and process.poll() is None: - if process.stdout: - line = process.stdout.readline() - if line: - # Clean up the log line and forward to our logger - log_msg = line.strip() - - # Skip empty lines and "No logs" messages - if not log_msg or log_msg.startswith("No logs"): - continue - - # Try to filter out old logs by checking if they contain our call ID - # or if they occurred after we started the function - current_time = time.time() - log_age = current_time - start_time - - # Only show logs that are likely from the current execution - # Skip logs that seem to be from much earlier runs - if call_id and call_id in log_msg: - # This log 
definitely belongs to our execution - logger.info(f"{log_msg}") - elif ( - log_age < 300 - ): # Only show logs from last 5 minutes - # This log is recent enough to likely be ours - logger.info(f"{log_msg}") - # Else: skip old logs - - else: - break - - # Clean up process - if process.poll() is None: - process.terminate() - try: - process.wait(timeout=5) - except subprocess.TimeoutExpired: - process.kill() - - except FileNotFoundError: - logger.warning( - "Platform CLI not found. Install with: pip install modal" - ) - except Exception as e: - logger.debug(f"Log streaming error: {e}") - - # Start log streaming thread - log_thread = threading.Thread(target=stream_logs, daemon=True) - log_thread.start() + # Create and start log streaming + log_streamer = ModalLogStreamer(app_name, call_id, logger) + log_streamer.start() try: # Poll for function completion @@ -637,6 +687,4 @@ def stream_logs() -> None: raise finally: # Stop log streaming - log_stream_active.clear() - # Give the log thread a moment to clean up - time.sleep(0.5) + log_streamer.stop() From 12ac7c48fc3e05a3c228516e9c57197d4619356b Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 13 Jun 2025 15:42:17 +0200 Subject: [PATCH 16/77] Clean up Modal orchestrator imports and code organization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Combine duplicate TYPE_CHECKING blocks into single import section - Improve import organization and reduce redundancy - Maintain all existing functionality while improving code structure 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .../modal/orchestrators/modal_orchestrator.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 8cd358c624c..5b66fcabdc4 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ 
b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -43,6 +43,9 @@ from zenml.entrypoints.step_entrypoint_configuration import ( StepEntrypointConfiguration, ) +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalExecutionMode, +) from zenml.integrations.modal.utils import ( ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, build_modal_image, @@ -64,12 +67,6 @@ ModalOrchestratorConfig, ModalOrchestratorSettings, ) - -from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( - ModalExecutionMode, -) - -if TYPE_CHECKING: from zenml.models import PipelineDeploymentResponse, PipelineRunResponse from zenml.models.v2.core.pipeline_deployment import PipelineDeploymentBase From 788ae84bc274a5b6d304d994b8845d3c30c46811 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 13 Jun 2025 16:16:30 +0200 Subject: [PATCH 17/77] Improve Modal integration import consistency and requirements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Import MODAL_ORCHESTRATOR_FLAVOR constant from central location to avoid duplication - Update requirements to modal>=1 after testing compatibility with both orchestrator and step operator - Remove unnecessary utils import that was only for mypy discovery - Maintain consistent import patterns across Modal integration files 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- src/zenml/integrations/modal/__init__.py | 2 -- .../integrations/modal/flavors/modal_orchestrator_flavor.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/zenml/integrations/modal/__init__.py b/src/zenml/integrations/modal/__init__.py index 57703b049d4..fe36ae7e6f3 100644 --- a/src/zenml/integrations/modal/__init__.py +++ b/src/zenml/integrations/modal/__init__.py @@ -47,7 +47,5 @@ def flavors(cls) -> List[Type[Flavor]]: return [ModalOrchestratorFlavor, ModalStepOperatorFlavor] -# Import utils module to make it discoverable by mypy 
-from zenml.integrations.modal import utils # noqa: F401 diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index f64b368f2fc..8cc63370868 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -23,7 +23,7 @@ if TYPE_CHECKING: from zenml.integrations.modal.orchestrators import ModalOrchestrator -MODAL_ORCHESTRATOR_FLAVOR = "modal" +from zenml.integrations.modal import MODAL_ORCHESTRATOR_FLAVOR class ModalExecutionMode(str, Enum): From 8cdb3c23ea8a2eefe7cc5a4ead5a83618ccc5f29 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 13 Jun 2025 16:16:43 +0200 Subject: [PATCH 18/77] Comprehensive Modal orchestrator documentation improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on PR review feedback: - Fix token authentication examples to use --token-id and --token-secret - Add "When NOT to use it" section with clear tradeoffs and alternatives - Add info boxes for environment separation best practices and cost implications - Document Modal vs Step Operator differences with usage recommendations - Add GPU base image requirements and CUDA compatibility warnings - Clarify execution modes: "pipeline" mode reduces overhead vs enables parallelism - Document resource fallback behavior and warming window defaults - Add container warming cost implications with specific guidance - Remove tracking pixel per review request - Improve overall documentation clarity and completeness 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .../component-guide/orchestrators/modal.md | 76 +++++++++++++++++-- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/docs/book/component-guide/orchestrators/modal.md b/docs/book/component-guide/orchestrators/modal.md index 014d4e296fa..f68ae490078 100644 --- 
a/docs/book/component-guide/orchestrators/modal.md +++ b/docs/book/component-guide/orchestrators/modal.md @@ -22,6 +22,20 @@ You should use the Modal orchestrator if: * you need easy access to GPUs and high-performance computing resources. * you prefer a simple setup process without complex Kubernetes configurations. +## When NOT to use it + +The Modal orchestrator may not be the best choice if: + +* **You need fine-grained step isolation**: Modal runs entire pipelines in single functions by default, which means all steps share the same resources and environment. For pipelines requiring different resource configurations per step, consider the [Modal step operator](../step-operators/modal.md) instead. + +* **You have strict data locality requirements**: Modal runs in specific cloud regions and may not be suitable if you need to keep data processing within specific geographic boundaries or on-premises. + +* **You require very long-running pipelines**: While Modal supports up to 24-hour timeouts, extremely long-running batch jobs (days/weeks) might be better suited for other orchestrators. + +* **You need complex workflow patterns**: Modal orchestrator is optimized for straightforward ML pipelines. If you need complex DAG patterns, conditional logic, or dynamic pipeline generation, other orchestrators might be more suitable. + +* **Cost optimization for infrequent workloads**: While Modal is cost-effective for regular workloads, very infrequent pipelines (running once per month) might benefit from traditional infrastructure that doesn't incur per-execution overhead. + ## How to deploy it The Modal orchestrator runs on Modal's cloud infrastructure, so you don't need to deploy or manage any servers. You just need: @@ -69,7 +83,8 @@ zenml stack register -o ... 
--set # Register the orchestrator with explicit credentials zenml orchestrator register \ --flavor=modal \ - --token= \ + --token-id= \ + --token-secret= \ --workspace= \ --synchronous=true @@ -97,6 +112,17 @@ You can access the Modal dashboard at [modal.com/apps](https://modal.com/apps) t ### Configuration overview +{% hint style="info" %} +**Modal Orchestrator vs Step Operator** + +ZenML offers both a [Modal orchestrator](modal.md) and a [Modal step operator](../step-operators/modal.md). Choose based on your needs: + +- **Modal Orchestrator**: Runs entire pipelines on Modal's infrastructure. Best for complete pipeline execution with consistent resource requirements. +- **Modal Step Operator**: Runs individual steps on Modal while keeping orchestration local. Best for selectively running compute-intensive steps (like training) on Modal while keeping other steps local. + +Use the orchestrator for full cloud execution, use the step operator for hybrid local/cloud workflows. +{% endhint %} + The Modal orchestrator uses two types of settings following ZenML's standard pattern: 1. **`ResourceSettings`** (standard ZenML) - for hardware resource quantities: @@ -158,6 +184,8 @@ def my_modal_pipeline(): {% hint style="info" %} **Pipeline-Level Resources**: The Modal orchestrator uses pipeline-level resource settings to configure the Modal function for the entire pipeline. All steps share the same Modal function resources. Configure resources at the `@pipeline` level for best results. + +**Resource Fallback Behavior**: If no pipeline-level resource settings are provided, the orchestrator will automatically use the highest resource requirements found across all steps in the pipeline. This ensures adequate resources for all steps while maintaining the single-function execution model. 
{% endhint %} You can configure pipeline-wide resource requirements using `ResourceSettings` for hardware resources and `ModalOrchestratorSettings` for Modal-specific configurations: @@ -202,8 +230,8 @@ def second_step(): The Modal orchestrator supports two execution modes: -1. **`pipeline` (default)**: Runs the entire pipeline in a single Modal function for maximum speed and cost efficiency -2. **`per_step`**: Runs each step in a separate Modal function call for granular control and debugging +1. **`pipeline` (default)**: Runs the entire pipeline in a single Modal function for minimal overhead and cost efficiency. Steps execute sequentially with no cold starts or function call overhead between them. +2. **`per_step`**: Runs each step in a separate Modal function call for granular control and debugging. Better for pipelines where steps can run in parallel or have very different resource requirements. {% hint style="info" %} **Resource Sharing**: Both execution modes use the same Modal function with the same resource configuration (from pipeline-level settings). The difference is whether steps run sequentially in one function call (`pipeline`) or as separate function calls (`per_step`). @@ -225,6 +253,19 @@ modal_settings = ModalOrchestratorSettings( Modal makes it easy to use GPUs for your ML workloads. Use `ResourceSettings` to specify the number of GPUs and `ModalOrchestratorSettings` to specify the GPU type: +{% hint style="warning" %} +**Base Image Requirements for GPU Usage** + +When using GPUs, ensure your base Docker image includes the appropriate CUDA runtime and drivers. Modal's GPU instances come with CUDA pre-installed, but your application dependencies (like PyTorch, TensorFlow) must be compatible with the CUDA version. 
+ +For optimal GPU performance: +- Use CUDA-compatible base images (e.g., `nvidia/cuda:11.8-runtime-ubuntu20.04`) +- Install GPU-compatible versions of ML frameworks in your Docker requirements +- Test your GPU setup locally before deploying to Modal + +ZenML will use your base image configuration from the container registry, so ensure GPU compatibility is built into your image. +{% endhint %} + ```python from zenml.config import ResourceSettings from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( @@ -303,6 +344,18 @@ modal_settings = ModalOrchestratorSettings( ### Authentication with different environments +{% hint style="info" %} +**Best Practice: Separate Stacks for Different Environments** + +Consider creating separate ZenML stacks for different environments (development, staging, production), each configured with different Modal environments and workspaces. This provides better isolation and allows for different resource configurations per environment. + +For example: +- **Development stack**: Uses Modal "dev" environment with smaller resource limits +- **Production stack**: Uses Modal "production" environment with production-grade resources and credentials + +This approach helps prevent accidental deployment to production and allows for environment-specific configurations. +{% endhint %} + For production deployments, you can specify different Modal environments: ```python @@ -333,6 +386,19 @@ def my_pipeline(): This ensures your pipelines start executing immediately without waiting for container initialization. +{% hint style="info" %} +**Cost Implications of Container Warming** + +Keeping warm containers (`min_containers > 0`) incurs costs even when your pipelines are not running, as Modal charges for idle container time. 
Consider these factors: + +- **Development**: Use `min_containers=0` to minimize costs during development +- **Production**: Use `min_containers=1-2` for fast startup times on frequent workloads +- **Resource costs**: Warm containers with GPUs are significantly more expensive than CPU-only containers +- **Time window**: Containers are reused within a 2-hour window by default (configurable via `app_warming_window_hours`) + +Monitor your Modal dashboard to track idle container costs and adjust settings based on your usage patterns. +{% endhint %} + ## Best practices 1. **Use pipeline mode for production**: The default `pipeline` execution mode runs your entire pipeline in one function, minimizing overhead and cost. @@ -370,6 +436,4 @@ This ensures your pipelines start executing immediately without waiting for cont - Monitor your functions in the [Modal dashboard](https://modal.com/apps) - Use `zenml logs` to view detailed pipeline execution logs -For more information and a full list of configurable attributes of the Modal orchestrator, check out the [SDK Docs](https://sdkdocs.zenml.io/latest/integration_code_docs/integrations-modal.html#zenml.integrations.modal.orchestrators). - -
ZenML Scarf
\ No newline at end of file +For more information and a full list of configurable attributes of the Modal orchestrator, check out the [SDK Docs](https://sdkdocs.zenml.io/latest/integration_code_docs/integrations-modal.html#zenml.integrations.modal.orchestrators). \ No newline at end of file From f2218b1df23ecf6218db106dc3452be8b1dbaa0d Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 13 Jun 2025 22:56:18 +0200 Subject: [PATCH 19/77] Refactor error message formatting and GPU count handling --- src/zenml/integrations/modal/utils.py | 15 ++++++++++----- .../step_operators/test_modal_step_operator.py | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 38101692886..5f0885fbffa 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -146,7 +146,11 @@ def __init__(self, message: str, suggestions: Optional[List[str]] = None): self.suggestions = suggestions or [] def __str__(self) -> str: - """Return formatted error message with suggestions.""" + """Return formatted error message with suggestions. + + Returns: + Formatted error message, optionally with suggestions. + """ base_message = super().__str__() if self.suggestions: suggestions_text = "\n".join(f" • {s}" for s in self.suggestions) @@ -255,9 +259,10 @@ def get_gpu_values( """ if not gpu_type: return None - # Prefer resource_settings gpu_count, fallback to 1 - gpu_count = resource_settings.gpu_count or 1 - return f"{gpu_type}:{gpu_count}" if gpu_count > 1 else gpu_type + gpu_count = resource_settings.gpu_count + if gpu_count is None or gpu_count == 0: + return gpu_type + return f"{gpu_type}:{gpu_count}" def get_resource_values( @@ -425,7 +430,7 @@ def get_or_deploy_persistent_modal_app( Tuple of (Modal function ready for execution, full app name). Raises: - Exception: If deployment fails. + ModalAuthenticationError: If platform authentication fails. 
""" # Create timestamp window for app reuse (rounds down to nearest window boundary) current_time = int(time.time()) diff --git a/tests/integration/integrations/modal/step_operators/test_modal_step_operator.py b/tests/integration/integrations/modal/step_operators/test_modal_step_operator.py index 865f6d55265..b418f72f6d7 100644 --- a/tests/integration/integrations/modal/step_operators/test_modal_step_operator.py +++ b/tests/integration/integrations/modal/step_operators/test_modal_step_operator.py @@ -42,5 +42,5 @@ def test_get_gpu_values(gpu, gpu_count, expected_result): settings = ModalStepOperatorSettings(gpu=gpu) resource_settings = ResourceSettings(gpu_count=gpu_count) - result = get_gpu_values(settings, resource_settings) + result = get_gpu_values(settings.gpu, resource_settings) assert result == expected_result From e9e25746be87fe9dcc9df2592b981fa71213231b Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 13 Jun 2025 23:10:29 +0200 Subject: [PATCH 20/77] Add base image requirements and GPU configuration details --- .../component-guide/orchestrators/modal.md | 105 +++++++++- .../component-guide/step-operators/modal.md | 188 +++++++++++++++++- 2 files changed, 275 insertions(+), 18 deletions(-) diff --git a/docs/book/component-guide/orchestrators/modal.md b/docs/book/component-guide/orchestrators/modal.md index f68ae490078..c46022c94c5 100644 --- a/docs/book/component-guide/orchestrators/modal.md +++ b/docs/book/component-guide/orchestrators/modal.md @@ -55,9 +55,8 @@ To use the Modal orchestrator, we need: * [Docker](https://www.docker.com) installed and running. * A [remote artifact store](../artifact-stores/README.md) as part of your stack. * A [remote container registry](../container-registries/README.md) as part of your stack. -* Modal CLI installed and authenticated: +* Modal authenticated: ```shell - pip install modal modal setup ``` @@ -95,7 +94,7 @@ zenml stack register -o ... 
--set You can get your Modal token from the [Modal dashboard](https://modal.com/settings/tokens). {% hint style="info" %} -ZenML will build a Docker image called `<CONTAINER_REGISTRY_URI>/zenml:<PIPELINE_NAME>` which includes your code and use it to run your pipeline steps in Modal functions. Check out [this page](https://docs.zenml.io/how-to/customize-docker-builds/) if you want to learn more about how ZenML builds these images and how you can customize them. +ZenML will build a Docker image called `<CONTAINER_REGISTRY_URI>/zenml:<PIPELINE_NAME>` which includes your code and use it to run your pipeline steps in Modal functions. Check out [this page](https://docs.zenml.io/concepts/containerization) if you want to learn more about how ZenML builds these images and how you can customize them. {% endhint %} You can now run any ZenML pipeline using the Modal orchestrator: @@ -249,6 +248,22 @@ modal_settings = ModalOrchestratorSettings( ) ``` +### Base image requirements + +{% hint style="info" %} +**Docker Image Customization** + +ZenML will automatically build a Docker image that includes your code and dependencies, then use it to run your pipeline on Modal. The base image and dependencies you configure will determine what's available in your Modal execution environment. + +Key considerations: +- **Base image**: Choose an appropriate base image for your workload (e.g., `python:3.9-slim`, `ubuntu:20.04`, or specialized ML images) +- **Dependencies**: Ensure all required packages are specified in your `requirements.txt` or Docker settings +- **System packages**: If you need system-level packages, configure them in your Docker settings +- **Environment variables**: Configure any necessary environment variables in your ZenML pipeline or Docker settings + +Check out the [ZenML Docker customization guide](https://docs.zenml.io/how-to/customize-docker-builds) for detailed information on customizing your execution environment. +{% endhint %} + ### Using GPUs Modal makes it easy to use GPUs for your ML workloads.
Use `ResourceSettings` to specify the number of GPUs and `ModalOrchestratorSettings` to specify the GPU type: @@ -365,6 +380,31 @@ modal_settings = ModalOrchestratorSettings( ) ``` +### How it works: Modal Apps and Functions + +{% hint style="info" %} +**Implementation Details** + +The ZenML Modal orchestrator implements pipeline execution using Modal's app and function architecture: + +**Modal App Deployment**: +- ZenML creates a persistent Modal app with a unique name that includes a time window and build checksum +- The app stays deployed and available for a configurable time period (default: 2 hours) +- Apps are automatically reused within the time window if the Docker image hasn't changed +- This eliminates the overhead of redeploying apps for consecutive pipeline runs + +**Function Execution**: +- Your pipeline runs inside Modal functions within the deployed app +- In `pipeline` mode (default): The entire pipeline executes in a single function call +- In `per_step` mode: Each step runs as a separate function call within the same app +- Functions can maintain warm containers between executions for faster startup + +**Container Management**: +- Modal manages container lifecycle automatically based on your `min_containers` and `max_containers` settings +- Warm containers stay ready with your Docker image loaded and dependencies installed +- Cold containers are spun up on-demand when warm containers are unavailable +{% endhint %} + ### Warm containers for faster execution Modal orchestrator uses persistent apps with warm containers to minimize cold starts: @@ -386,19 +426,62 @@ def my_pipeline(): This ensures your pipelines start executing immediately without waiting for container initialization. 
-{% hint style="info" %} -**Cost Implications of Container Warming** +{% hint style="warning" %} +**Cost Implications and Optimization** + +Understanding Modal orchestrator costs helps optimize your spend: -Keeping warm containers (`min_containers > 0`) incurs costs even when your pipelines are not running, as Modal charges for idle container time. Consider these factors: +**Container Costs**: +- **Warm containers** (`min_containers > 0`): You pay for idle time even when pipelines aren't running +- **Cold containers**: Only pay when actually executing, but incur startup time (~30-60 seconds) +- **GPU containers**: Significantly more expensive than CPU-only containers for idle time -- **Development**: Use `min_containers=0` to minimize costs during development -- **Production**: Use `min_containers=1-2` for fast startup times on frequent workloads -- **Resource costs**: Warm containers with GPUs are significantly more expensive than CPU-only containers -- **Time window**: Containers are reused within a 2-hour window by default (configurable via `app_warming_window_hours`) +**App Deployment Costs**: +- **App reuse**: No additional cost when reusing apps within the time window (default: 2 hours) +- **New deployments**: Small deployment overhead for each new app (new time window or changed Docker image) -Monitor your Modal dashboard to track idle container costs and adjust settings based on your usage patterns. 
+**Execution Mode Costs**: +- **Pipeline mode**: Most cost-effective - single function call for entire pipeline +- **Per-step mode**: Higher cost due to multiple function calls, but better for debugging + +**Cost Optimization Strategies**: +- **Development**: Use `min_containers=0` to avoid idle costs +- **Production (frequent)**: Use `min_containers=1-2` for pipelines running multiple times per hour +- **Production (infrequent)**: Use `min_containers=0` for pipelines running less than once per hour +- **GPU workloads**: Be especially careful with `min_containers` due to high GPU idle costs +- **Time windows**: Adjust `app_warming_window_hours` based on your pipeline frequency + +Monitor your Modal dashboard to track container utilization and costs, then adjust settings accordingly. {% endhint %} +### App reuse and warming windows + +You can control how long Modal apps stay deployed and available for reuse: + +```python +modal_settings = ModalOrchestratorSettings( + app_warming_window_hours=4.0, # Keep apps deployed for 4 hours + min_containers=1, # Keep 1 container warm + max_containers=5 # Scale up to 5 containers +) + +@pipeline(settings={"orchestrator": modal_settings}) +def my_pipeline(): + # This pipeline will reuse the same Modal app if run within 4 hours + # and the Docker image hasn't changed + ... +``` + +**App Reuse Benefits**: +- **Faster execution**: No app deployment time for subsequent runs +- **Cost efficiency**: No repeated deployment overhead +- **Consistent environment**: Same app instance for related pipeline runs + +**When apps are recreated**: +- After the warming window expires (default: 2 hours) +- When the Docker image changes (new dependencies, code changes) +- When resource requirements change significantly + ## Best practices 1. **Use pipeline mode for production**: The default `pipeline` execution mode runs your entire pipeline in one function, minimizing overhead and cost. 
diff --git a/docs/book/component-guide/step-operators/modal.md b/docs/book/component-guide/step-operators/modal.md index 6a772916a12..8b7e307d905 100644 --- a/docs/book/component-guide/step-operators/modal.md +++ b/docs/book/component-guide/step-operators/modal.md @@ -8,11 +8,35 @@ description: Executing individual steps in Modal. ### When to use it +{% hint style="info" %} +**Modal Step Operator vs Orchestrator** + +ZenML offers both a [Modal step operator](modal.md) and a [Modal orchestrator](../orchestrators/modal.md). Choose based on your needs: + +- **Modal Step Operator**: Runs individual steps on Modal while keeping orchestration local. Best for selectively running compute-intensive steps (like training) on Modal while keeping other steps local. +- **Modal Orchestrator**: Runs entire pipelines on Modal's infrastructure. Best for complete pipeline execution with consistent resource requirements. + +Use the step operator for hybrid local/cloud workflows, use the orchestrator for full cloud execution. +{% endhint %} + You should use the Modal step operator if: -* You need fast execution time for steps that require computing resources (CPU, GPU, memory). -* You want to easily specify the exact hardware requirements (e.g., GPU type, CPU count, memory) for each step. -* You have access to Modal. +* You want to run only specific compute-intensive steps (like training or inference) on Modal while keeping other steps local. +* You need different hardware requirements for different steps in your pipeline. +* You want to leverage Modal's fast execution and GPU access for select steps without moving your entire pipeline to the cloud. +* You have a hybrid workflow where some steps need to access local resources or data. 
+ +### When NOT to use it + +The Modal step operator may not be the best choice if: + +* **You want to run entire pipelines on Modal**: Use the [Modal orchestrator](../orchestrators/modal.md) instead for complete pipeline execution with better cost efficiency and reduced overhead. + +* **You have simple, lightweight steps**: For steps that don't require significant compute resources, the overhead of running them on Modal may not be worth it. + +* **You need very low latency**: The step operator introduces some overhead for individual step execution compared to running steps locally. + +* **You have tight data locality requirements**: If your steps need to access large amounts of local data, transferring it to Modal for each step execution may be inefficient. ### How to deploy it @@ -39,11 +63,29 @@ To use the Modal step operator, we need: We can then register the step operator: +**Option 1: Using Modal CLI authentication (recommended for development)** + ```shell +# Register the step operator (uses Modal CLI credentials) zenml step-operator register --flavor=modal zenml stack update -s ... ``` +**Option 2: Using Modal API token (recommended for production)** + +```shell +# Register the step operator with explicit credentials +zenml step-operator register \ + --flavor=modal \ + --token-id= \ + --token-secret= \ + --workspace= \ + --environment= +zenml stack update -s ... +``` + +You can get your Modal token from the [Modal dashboard](https://modal.com/settings/tokens). + Once you added the step operator to your active stack, you can use it to execute individual steps of your pipeline by specifying it in the `@step` decorator as follows: ```python @@ -62,6 +104,24 @@ ZenML will build a Docker image which includes your code and use it to run your #### Additional configuration +The Modal step operator uses two types of settings following ZenML's standard pattern: + +1. 
**`ResourceSettings`** (standard ZenML) - for hardware resource quantities: + - `cpu_count` - Number of CPU cores + - `memory` - Memory allocation (e.g., "16GB") + - `gpu_count` - Number of GPUs to allocate + +2. **`ModalStepOperatorSettings`** (Modal-specific) - for Modal platform configuration: + - `gpu` - GPU type specification (e.g., "T4", "A100", "H100") + - `region` - Cloud region preference + - `cloud` - Cloud provider selection + - `environment` - Modal environment name + - `timeout` - Maximum execution time in seconds + +{% hint style="info" %} +**GPU Configuration**: Use `ResourceSettings.gpu_count` to specify how many GPUs you need, and `ModalStepOperatorSettings.gpu` to specify what type of GPU. Modal will combine these automatically (e.g., `gpu_count=2` + `gpu="A100"` becomes `"A100:2"`). +{% endhint %} + You can specify the hardware requirements for each step using the `ResourceSettings` class as described in our documentation on [resource settings](https://docs.zenml.io/user-guides/tutorial/distributed-training): @@ -69,10 +129,20 @@ You can specify the hardware requirements for each step using the from zenml.config import ResourceSettings from zenml.integrations.modal.flavors import ModalStepOperatorSettings -modal_settings = ModalStepOperatorSettings(gpu="A100") +# Configure Modal-specific settings +modal_settings = ModalStepOperatorSettings( + gpu="A100", # GPU type (optional) + region="us-east-1", # Preferred region + cloud="aws", # Cloud provider + environment="production", # Modal environment + timeout=3600, # 1 hour timeout +) + +# Configure hardware resources (quantities) resource_settings = ResourceSettings( - cpu=2, - memory="32GB" + cpu_count=2, # Number of CPU cores + memory="32GB", # 32GB RAM + gpu_count=1 # Number of GPUs (combined with gpu type above) ) @step( @@ -83,11 +153,12 @@ resource_settings = ResourceSettings( } ) def my_modal_step(): + # This step will run on Modal with 1x A100 GPU, 2 CPU cores, and 32GB RAM ... 
``` {% hint style="info" %} -Note that the `cpu` parameter in `ResourceSettings` currently only accepts a single integer value. This specifies a soft minimum limit - Modal will guarantee at least this many physical cores, but the actual usage could be higher. The CPU cores/hour will also determine the minimum price paid for the compute resources. +Note that the `cpu_count` parameter in `ResourceSettings` specifies a soft minimum limit - Modal will guarantee at least this many physical cores, but the actual usage could be higher. The CPU cores/hour will also determine the minimum price paid for the compute resources. For example, with the configuration above (2 CPUs and 32GB memory), the minimum cost would be approximately $1.03 per hour ((0.135 * 2) + (0.024 * 32) = $1.03). {% endhint %} @@ -100,6 +171,45 @@ full list of supported GPU types and the [SDK docs](https://sdkdocs.zenml.io/latest/integration\_code\_docs/integrations-modal/#zenml.integrations.modal.flavors.modal\_step\_operator\_flavor.ModalStepOperatorSettings) for more details on the available settings. +### Authentication with different environments + +{% hint style="info" %} +**Best Practice: Separate Step Operators for Different Environments** + +Consider creating separate ZenML step operators for different environments (development, staging, production), each configured with different Modal environments and workspaces. This provides better isolation and allows for different resource configurations per environment. + +For example: +- **Development step operator**: Uses Modal "dev" environment with smaller resource limits +- **Production step operator**: Uses Modal "production" environment with production-grade resources and credentials + +This approach helps prevent accidental deployment to production and allows for environment-specific configurations. 
+{% endhint %} + +For production deployments, you can specify different Modal environments and workspaces: + +```python +modal_settings = ModalStepOperatorSettings( + environment="production", # or "staging", "dev", etc. + workspace="my-company", # Modal workspace name + gpu="A100", + region="us-east-1", + cloud="aws" +) +``` + +Or configure them when registering the step operator: + +```shell +zenml step-operator register modal_prod \ + --flavor=modal \ + --token-id= \ + --token-secret= \ + --workspace="production-workspace" \ + --environment="production" +``` + +### Resource configuration notes + The settings do allow you to specify the region and cloud provider, but these settings are only available for Modal Enterprise and Team plan customers. Moreover, certain combinations of settings are not available. It is suggested to @@ -109,6 +219,70 @@ detailed error messages that can help identify what is incompatible. See more in the [Modal docs on region selection](https://modal.com/docs/guide/region-selection) for more details. +### Available GPU types + +Modal supports various GPU types for different workloads: + +- `T4` - Cost-effective for inference and light training +- `A10G` - Balanced performance for training and inference +- `A100` - High-performance for large model training +- `H100` - Latest generation for maximum performance + +**Examples of GPU configurations:** + +```python +# Single GPU step +@step( + settings={ + "step_operator": ModalStepOperatorSettings(gpu="A100"), + "resources": ResourceSettings(gpu_count=1) + } +) +def train_model(): + # Uses 1x A100 GPU + ... + +# Multiple GPU step +@step( + settings={ + "step_operator": ModalStepOperatorSettings(gpu="A100"), + "resources": ResourceSettings(gpu_count=4) + } +) +def distributed_training(): + # Uses 4x A100 GPUs + ... 
+``` + +### Base image requirements + +{% hint style="info" %} +**Docker Image Customization** + +ZenML will automatically build a Docker image that includes your code and dependencies, then use it to run your steps on Modal. The base image and dependencies you configure will determine what's available in your Modal execution environment. + +Key considerations: +- **Base image**: Choose an appropriate base image for your workload (e.g., `python:3.9-slim`, `ubuntu:20.04`, or specialized ML images) +- **Dependencies**: Ensure all required packages are specified in your `requirements.txt` or Docker settings +- **System packages**: If you need system-level packages, configure them in your Docker settings +- **Environment variables**: Configure any necessary environment variables in your ZenML step or Docker settings + +Check out the [ZenML Docker customization guide](https://docs.zenml.io/how-to/customize-docker-builds) for detailed information on customizing your execution environment. +{% endhint %} + +{% hint style="warning" %} +**Base Image Requirements for GPU Usage** + +When using GPUs, ensure your base Docker image includes the appropriate CUDA runtime and drivers. Modal's GPU instances come with CUDA pre-installed, but your application dependencies (like PyTorch, TensorFlow) must be compatible with the CUDA version. + +For optimal GPU performance: +- Use CUDA-compatible base images (e.g., `nvidia/cuda:11.8-runtime-ubuntu20.04`) +- Install GPU-compatible versions of ML frameworks in your Docker requirements +- Test your GPU setup locally before deploying to Modal + +ZenML will use your base image configuration from the container registry, so ensure GPU compatibility is built into your image. +{% endhint %} +
ZenML Scarf
From 331a164f129d439d13068ae4c49060567878a4db Mon Sep 17 00:00:00 2001 From: Safoine El Khabich <34200873+safoinme@users.noreply.github.com> Date: Tue, 17 Jun 2025 08:41:09 +0100 Subject: [PATCH 21/77] Update src/zenml/integrations/modal/utils.py --- src/zenml/integrations/modal/utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 5f0885fbffa..021c950d675 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -80,17 +80,14 @@ def _stream_logs(self) -> None: "--timestamps", ] self.logger.debug(f"Starting log stream: {' '.join(cmd)}") - - process = subprocess.Popen( + with subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, # Line buffered universal_newlines=True, - ) - - # Stream logs line by line + ) as process: while self.log_stream_active.is_set() and process.poll() is None: if process.stdout: line = process.stdout.readline() From 3173f6aee47de8dd54506053ec3ea3fe63c2d34f Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 23 Jun 2025 21:51:37 +0200 Subject: [PATCH 22/77] Apply suggestions from code review Co-authored-by: Michael Schuster --- .../integrations/modal/flavors/modal_orchestrator_flavor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 8cc63370868..326839e482c 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -81,7 +81,7 @@ class ModalOrchestratorSettings(BaseSettings): class ModalOrchestratorConfig( BaseOrchestratorConfig, ModalOrchestratorSettings ): - """Modal orchestrator config optimized for BLAZING FAST execution. + """Modal orchestrator config. 
Attributes: token_id: Modal API token ID (ak-xxxxx format) for authentication. From 37d4a57c618b53a1fbe5af875ad51d1a07d97c2d Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 23 Jun 2025 22:17:57 +0200 Subject: [PATCH 23/77] Update environment variable names to be consistent.- Rename `environment` to `modal_environment` in settings and configs.- Update references and usage correspondingly.- Refactor logging in `ModalLogStreamer` class. --- .../flavors/modal_orchestrator_flavor.py | 12 +- .../flavors/modal_step_operator_flavor.py | 8 +- .../modal/orchestrators/modal_orchestrator.py | 177 +++--------------- .../step_operators/modal_step_operator.py | 2 +- src/zenml/integrations/modal/utils.py | 50 ++--- 5 files changed, 60 insertions(+), 189 deletions(-) diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 326839e482c..808d1537d12 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -48,7 +48,7 @@ class ModalOrchestratorSettings(BaseSettings): Use ResourceSettings.gpu_count to specify the number of GPUs. region: The region to use for the pipeline execution. cloud: The cloud provider to use for the pipeline execution. - environment: The Modal environment to use for the pipeline execution. + modal_environment: The Modal environment to use for the pipeline execution. timeout: Maximum execution time in seconds (default 24h). min_containers: Minimum containers to keep warm (replaces keep_warm). max_containers: Maximum concurrent containers (replaces concurrency_limit). 
@@ -61,7 +61,7 @@ class ModalOrchestratorSettings(BaseSettings): gpu: Optional[str] = None region: Optional[str] = None cloud: Optional[str] = None - environment: Optional[str] = None + modal_environment: Optional[str] = None timeout: int = 86400 # 24 hours (Modal's maximum) min_containers: Optional[int] = ( 1 # Keep 1 container warm for sequential execution @@ -87,7 +87,7 @@ class ModalOrchestratorConfig( token_id: Modal API token ID (ak-xxxxx format) for authentication. token_secret: Modal API token secret (as-xxxxx format) for authentication. workspace: Modal workspace name (optional). - environment: Modal environment name (optional). + modal_environment: Modal environment name (optional). Note: If token_id and token_secret are not provided, falls back to Modal's default authentication (~/.modal.toml). @@ -96,7 +96,7 @@ class ModalOrchestratorConfig( token_id: Optional[str] = SecretField(default=None) token_secret: Optional[str] = SecretField(default=None) workspace: Optional[str] = None - environment: Optional[str] = None + modal_environment: Optional[str] = None @property def is_remote(self) -> bool: @@ -112,9 +112,9 @@ def is_synchronous(self) -> bool: """Whether the orchestrator runs synchronous or not. Returns: - True since the orchestrator waits for completion. + Whether the orchestrator runs synchronous or not. """ - return True + return self.synchronous class ModalOrchestratorFlavor(BaseOrchestratorFlavor): diff --git a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py index 89d103233f3..53348756070 100644 --- a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py @@ -41,14 +41,14 @@ class ModalStepOperatorSettings(BaseSettings): Use ResourceSettings.gpu_count to specify the number of GPUs. region: The region to use for the step execution. 
cloud: The cloud provider to use for the step execution. - environment: The Modal environment to use for the step execution. + modal_environment: The Modal environment to use for the step execution. timeout: Maximum execution time in seconds (default 24h). """ gpu: Optional[str] = None region: Optional[str] = None cloud: Optional[str] = None - environment: Optional[str] = None + modal_environment: Optional[str] = None timeout: int = 86400 # 24 hours (Modal's maximum) @@ -61,7 +61,7 @@ class ModalStepOperatorConfig( token_id: Modal API token ID (ak-xxxxx format) for authentication. token_secret: Modal API token secret (as-xxxxx format) for authentication. workspace: Modal workspace name (optional). - environment: Modal environment name (optional). + modal_environment: Modal environment name (optional). Note: If token_id and token_secret are not provided, falls back to Modal's default authentication (~/.modal.toml). @@ -70,7 +70,7 @@ class ModalStepOperatorConfig( token_id: Optional[str] = SecretField(default=None) token_secret: Optional[str] = SecretField(default=None) workspace: Optional[str] = None - environment: Optional[str] = None + modal_environment: Optional[str] = None @property def is_remote(self) -> bool: diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 5b66fcabdc4..4eaf3006ddb 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -14,14 +14,13 @@ """Implementation of a Modal orchestrator.""" import os -import sys import time import traceback from typing import ( TYPE_CHECKING, Any, Dict, - List, + Iterator, Optional, Tuple, Type, @@ -29,23 +28,11 @@ ) from uuid import uuid4 -try: - import modal -except ImportError: - modal = None # type: ignore - from zenml.config.base_settings import BaseSettings -from zenml.config.build_configuration import 
BuildConfiguration from zenml.config.constants import RESOURCE_SETTINGS_KEY from zenml.entrypoints.pipeline_entrypoint_configuration import ( PipelineEntrypointConfiguration, ) -from zenml.entrypoints.step_entrypoint_configuration import ( - StepEntrypointConfiguration, -) -from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( - ModalExecutionMode, -) from zenml.integrations.modal.utils import ( ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, build_modal_image, @@ -58,6 +45,7 @@ stream_modal_logs_and_wait, ) from zenml.logger import get_logger +from zenml.metadata.metadata_types import MetadataType from zenml.orchestrators import ContainerizedOrchestrator from zenml.stack import Stack, StackValidator from zenml.utils import string_utils @@ -68,63 +56,10 @@ ModalOrchestratorSettings, ) from zenml.models import PipelineDeploymentResponse, PipelineRunResponse - from zenml.models.v2.core.pipeline_deployment import PipelineDeploymentBase logger = get_logger(__name__) -def run_step_in_modal( - step_name: str, - deployment_id: str, - orchestrator_run_id: str, -) -> None: - """Execute a single ZenML step in Modal. - - Args: - step_name: Name of the step to execute. - deployment_id: ID of the pipeline deployment. - orchestrator_run_id: ID of the orchestrator run. - - Raises: - Exception: If step execution fails. 
- """ - # Get pipeline run ID for debugging/logging - pipeline_run_id = os.environ.get("ZENML_PIPELINE_RUN_ID", "unknown") - - logger.info( - f"Running step '{step_name}' remotely (pipeline run: {pipeline_run_id})" - ) - sys.stdout.flush() - - # Set the orchestrator run ID in the Modal environment - os.environ["ZENML_MODAL_ORCHESTRATOR_RUN_ID"] = orchestrator_run_id - - try: - logger.info( - f"Executing step '{step_name}' directly in process for maximum speed" - ) - sys.stdout.flush() - - # Create the entrypoint arguments - args = StepEntrypointConfiguration.get_entrypoint_arguments( - step_name=step_name, deployment_id=deployment_id - ) - - # Create the configuration and run the step - config = StepEntrypointConfiguration(arguments=args) - config.run() - - logger.info(f"Step {step_name} completed successfully") - sys.stdout.flush() - - except Exception as e: - error_details = traceback.format_exc() - logger.error(f"Error executing step {step_name}: {e}") - logger.debug(f"Full traceback:\n{error_details}") - sys.stdout.flush() - raise - - def run_entire_pipeline( deployment_id: str, orchestrator_run_id: str, @@ -138,20 +73,8 @@ def run_entire_pipeline( Raises: Exception: If pipeline execution fails. 
""" - # Get pipeline run ID for debugging/logging - pipeline_run_id = os.environ.get("ZENML_PIPELINE_RUN_ID", "unknown") - - logger.info( - "Starting entire pipeline using PipelineEntrypointConfiguration", - extra={ - "deployment_id": deployment_id, - "orchestrator_run_id": orchestrator_run_id, - "pipeline_run_id": pipeline_run_id, - }, - ) - # Set the orchestrator run ID in the Modal environment - os.environ["ZENML_MODAL_ORCHESTRATOR_RUN_ID"] = orchestrator_run_id + os.environ[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id try: logger.debug("Initializing pipeline entrypoint configuration") @@ -212,7 +135,7 @@ def _setup_modal_client(self) -> None: token_id=self.config.token_id, token_secret=self.config.token_secret, workspace=self.config.workspace, - environment=self.config.environment, + environment=self.config.modal_environment, ) @property @@ -243,21 +166,6 @@ def get_orchestrator_run_id(self) -> str: f"{ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID}." ) - def get_docker_builds( - self, deployment: "PipelineDeploymentBase" - ) -> List[BuildConfiguration]: - """Get the Docker build configurations for the Modal orchestrator. - - Args: - deployment: The pipeline deployment. - - Returns: - A list of Docker build configurations. - """ - # Use the standard containerized orchestrator build logic - # This ensures ZenML builds the image with all pipeline code - return super().get_docker_builds(deployment) - def _build_modal_image( self, deployment: "PipelineDeploymentResponse", @@ -285,7 +193,7 @@ def prepare_or_run_pipeline( stack: "Stack", environment: Dict[str, str], placeholder_run: Optional["PipelineRunResponse"] = None, - ) -> Any: + ) -> Optional[Iterator[Dict[str, MetadataType]]]: """Runs the complete pipeline in a single Modal function. Args: @@ -296,14 +204,8 @@ def prepare_or_run_pipeline( placeholder_run: An optional placeholder run for the deployment (unused). Raises: - RuntimeError: If Modal is not installed or if a step fails. 
Exception: If pipeline execution fails. """ - _ = placeholder_run # Mark as intentionally unused - if modal is None: - raise RuntimeError( - "Required dependencies not installed. Please install with: pip install modal" - ) if deployment.schedule: logger.warning( "Serverless Orchestrator currently does not support the " @@ -317,14 +219,8 @@ def prepare_or_run_pipeline( # Generate orchestrator run ID and include pipeline run ID for isolation orchestrator_run_id = str(uuid4()) - # Include pipeline run ID to prevent conflicts when same container - # handles multiple pipeline runs rapidly - pipeline_run_id = placeholder_run.id if placeholder_run else "unknown" - environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id - environment["ZENML_PIPELINE_RUN_ID"] = str(pipeline_run_id) - logger.debug(f"Pipeline run ID: {pipeline_run_id}") logger.debug(f"Orchestrator run ID: {orchestrator_run_id}") # Get settings from pipeline configuration (applies to entire pipeline) @@ -351,29 +247,14 @@ def prepare_or_run_pipeline( "Starting pipeline execution with persistent serverless functions" ) - step_names = list(deployment.step_configurations.keys()) - logger.debug(f"Found {len(step_names)} steps: {step_names}") - - # Create the execution function based on execution mode - execution_mode = settings.execution_mode or self.config.execution_mode - if execution_mode == ModalExecutionMode.PER_STEP: - logger.debug("Creating per-step mode for granular execution") - execution_func: Any = run_step_in_modal - function_name = "run_step_in_modal" - else: - logger.debug("Creating pipeline mode for maximum speed") - execution_func = run_entire_pipeline - function_name = "run_entire_pipeline" - - # Get or deploy persistent Modal app with warm containers - mode_suffix = execution_mode.value.replace("_", "-") - app_name_base = f"zenml-{deployment.pipeline_configuration.name.replace('_', '-')}-{mode_suffix}" + # Use simplified app naming based on deployment ID + app_name_base = 
f"zenml-pipeline-{deployment.id}" execute_step, full_app_name = get_or_deploy_persistent_modal_app( app_name_base=app_name_base, zenml_image=zenml_image, - execution_func=execution_func, - function_name=function_name, + execution_func=run_entire_pipeline, + function_name="run_entire_pipeline", deployment=deployment, gpu_values=gpu_values, cpu_count=cpu_count, # Use ResourceSettings value or None (Modal default) @@ -385,8 +266,8 @@ def prepare_or_run_pipeline( or self.config.min_containers, max_containers=settings.max_containers or self.config.max_containers, - environment_name=settings.environment - or self.config.environment, # Use environment from config/settings + environment_name=settings.modal_environment + or self.config.modal_environment, # Use modal_environment from config/settings app_warming_window_hours=settings.app_warming_window_hours or self.config.app_warming_window_hours, ) @@ -430,30 +311,16 @@ def execute_modal_function( ) return function_call - if execution_mode == ModalExecutionMode.PER_STEP: - logger.info("Using per-step mode for granular execution") - # Execute steps individually - for step_name in step_names: - try: - execute_modal_function( - (step_name, deployment.id, orchestrator_run_id), - f"Step '{step_name}' execution", - ) - except Exception as e: - logger.error(f"Step '{step_name}' failed: {e}") - logger.info("Check platform dashboard for detailed logs") - raise - else: - # Default: execute entire pipeline in one function - try: - execute_modal_function( - (deployment.id, orchestrator_run_id), - "Pipeline execution", - ) - except Exception as e: - logger.error(f"Pipeline failed: {e}") - logger.info("Check platform dashboard for detailed logs") - raise + # Execute entire pipeline in one function + try: + execute_modal_function( + (deployment.id, orchestrator_run_id), + "Pipeline execution", + ) + except Exception as e: + logger.error(f"Pipeline failed: {e}") + logger.info("Check platform dashboard for detailed logs") + raise 
run_duration = time.time() - start_time @@ -462,3 +329,5 @@ def execute_modal_function( "Pipeline run has finished in `%s`.", string_utils.get_human_readable_time(run_duration), ) + + return None diff --git a/src/zenml/integrations/modal/step_operators/modal_step_operator.py b/src/zenml/integrations/modal/step_operators/modal_step_operator.py index aab59bfe409..2e6377f81ef 100644 --- a/src/zenml/integrations/modal/step_operators/modal_step_operator.py +++ b/src/zenml/integrations/modal/step_operators/modal_step_operator.py @@ -126,7 +126,7 @@ def launch( token_id=self.config.token_id, token_secret=self.config.token_secret, workspace=self.config.workspace, - environment=self.config.environment, + environment=self.config.modal_environment, ) # Build Modal image using shared utility diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 021c950d675..b90a4c81be3 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -88,30 +88,32 @@ def _stream_logs(self) -> None: bufsize=1, # Line buffered universal_newlines=True, ) as process: - while self.log_stream_active.is_set() and process.poll() is None: - if process.stdout: - line = process.stdout.readline() - if line: - # Clean up the log line and forward to our logger - log_msg = line.strip() - - # Skip empty lines and "No logs" messages - if not log_msg or log_msg.startswith("No logs"): - continue - - # Filter logs based on function call ID when available - # If no call ID, show all logs (Modal CLI handles recency) - if self.call_id and self.call_id in log_msg: - # This log definitely belongs to our execution - self.logger.info(f"{log_msg}") - elif not self.call_id: - # No call ID available, show all logs from Modal CLI stream - # Modal CLI already filters for recent/relevant logs - self.logger.info(f"{log_msg}") - # Else: skip logs that don't match our call ID - - else: - break + while ( + self.log_stream_active.is_set() and process.poll() 
is None + ): + if process.stdout: + line = process.stdout.readline() + if line: + # Clean up the log line and forward to our logger + log_msg = line.strip() + + # Skip empty lines and "No logs" messages + if not log_msg or log_msg.startswith("No logs"): + continue + + # Filter logs based on function call ID when available + # If no call ID, show all logs (Modal CLI handles recency) + if self.call_id and self.call_id in log_msg: + # This log definitely belongs to our execution + self.logger.info(f"{log_msg}") + elif not self.call_id: + # No call ID available, show all logs from Modal CLI stream + # Modal CLI already filters for recent/relevant logs + self.logger.info(f"{log_msg}") + # Else: skip logs that don't match our call ID + + else: + break # Clean up process if process.poll() is None: From e8b6b42ef6fbe6b38eabec2333b26215d67eaff9 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 23 Jun 2025 22:27:48 +0200 Subject: [PATCH 24/77] Refactor ModalOrchestrator app naming for image builds --- .../modal/orchestrators/modal_orchestrator.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 4eaf3006ddb..cc8c397e2dd 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -13,6 +13,7 @@ # permissions and limitations under the License. """Implementation of a Modal orchestrator.""" +import hashlib import os import time import traceback @@ -205,6 +206,9 @@ def prepare_or_run_pipeline( Raises: Exception: If pipeline execution fails. + + Returns: + None if the pipeline is executed synchronously, otherwise an iterator of metadata dictionaries. 
""" if deployment.schedule: logger.warning( @@ -247,8 +251,21 @@ def prepare_or_run_pipeline( "Starting pipeline execution with persistent serverless functions" ) - # Use simplified app naming based on deployment ID - app_name_base = f"zenml-pipeline-{deployment.id}" + # Use image-based app naming for build-specific persistence + # This allows multiple pipelines with the same image/build to reuse the same Modal app + # Different builds get different apps, ensuring proper dependency isolation + image_name = self.get_image(deployment=deployment) + # Create a safe app name from the image name (hash if too long) + safe_image_name = ( + image_name.replace("/", "-").replace(":", "-").replace(".", "-") + ) + if ( + len(safe_image_name) > 50 + ): # Modal app names should be reasonable length + # Use hash to ensure uniqueness while keeping reasonable length + image_hash = hashlib.md5(image_name.encode()).hexdigest()[:8] + safe_image_name = f"hashed-{image_hash}" + app_name_base = f"zenml-{safe_image_name}" execute_step, full_app_name = get_or_deploy_persistent_modal_app( app_name_base=app_name_base, From 32a56dfce9fe1ed2b98c451d40ffc14046b01da8 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 23 Jun 2025 23:05:44 +0200 Subject: [PATCH 25/77] Use orchestrator run ID for complete isolation during testing --- .../modal/orchestrators/modal_orchestrator.py | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index cc8c397e2dd..ec195d1b29b 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -13,7 +13,6 @@ # permissions and limitations under the License. 
"""Implementation of a Modal orchestrator.""" -import hashlib import os import time import traceback @@ -79,6 +78,8 @@ def run_entire_pipeline( try: logger.debug("Initializing pipeline entrypoint configuration") + logger.debug(f"Deployment ID: {deployment_id}") + logger.debug(f"Orchestrator Run ID: {orchestrator_run_id}") # Create the entrypoint arguments args = PipelineEntrypointConfiguration.get_entrypoint_arguments( @@ -225,6 +226,11 @@ def prepare_or_run_pipeline( environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id + # Pass pipeline run ID for proper isolation (following other orchestrators' pattern) + if placeholder_run: + environment["ZENML_PIPELINE_RUN_ID"] = str(placeholder_run.id) + logger.debug(f"Pipeline run ID: {placeholder_run.id}") + logger.debug(f"Orchestrator run ID: {orchestrator_run_id}") # Get settings from pipeline configuration (applies to entire pipeline) @@ -251,21 +257,9 @@ def prepare_or_run_pipeline( "Starting pipeline execution with persistent serverless functions" ) - # Use image-based app naming for build-specific persistence - # This allows multiple pipelines with the same image/build to reuse the same Modal app - # Different builds get different apps, ensuring proper dependency isolation - image_name = self.get_image(deployment=deployment) - # Create a safe app name from the image name (hash if too long) - safe_image_name = ( - image_name.replace("/", "-").replace(":", "-").replace(".", "-") - ) - if ( - len(safe_image_name) > 50 - ): # Modal app names should be reasonable length - # Use hash to ensure uniqueness while keeping reasonable length - image_hash = hashlib.md5(image_name.encode()).hexdigest()[:8] - safe_image_name = f"hashed-{image_hash}" - app_name_base = f"zenml-{safe_image_name}" + # TEMPORARY: Use orchestrator run ID for complete isolation during testing + # TODO: Revert to image-based naming once step conflict issue is resolved + app_name_base = f"zenml-run-{orchestrator_run_id}" execute_step, 
full_app_name = get_or_deploy_persistent_modal_app( app_name_base=app_name_base, From 7616eefb5e4e65754e9b2faf9b3318ff765d0f97 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 23 Jun 2025 23:39:40 +0200 Subject: [PATCH 26/77] Refactor modal orchestrator for new app-function architecture --- .../modal/orchestrators/modal_orchestrator.py | 18 ++++++++++++---- src/zenml/integrations/modal/utils.py | 21 +++++-------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index ec195d1b29b..4078c14ea43 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -13,6 +13,7 @@ # permissions and limitations under the License. """Implementation of a Modal orchestrator.""" +import hashlib import os import time import traceback @@ -257,15 +258,24 @@ def prepare_or_run_pipeline( "Starting pipeline execution with persistent serverless functions" ) - # TEMPORARY: Use orchestrator run ID for complete isolation during testing - # TODO: Revert to image-based naming once step conflict issue is resolved - app_name_base = f"zenml-run-{orchestrator_run_id}" + # NEW ARCHITECTURE: App = Pipeline (persistent), Function = Build (isolated) + # App stays warm per pipeline, functions handle different builds within the app + pipeline_name = deployment.pipeline_configuration.name.replace( + "_", "-" + ) + app_name_base = f"zenml-pipeline-{pipeline_name}" + + # Function name based on build + run for complete isolation + image_name = self.get_image(deployment=deployment) + image_hash = hashlib.md5(image_name.encode()).hexdigest()[:8] + run_suffix = orchestrator_run_id[-8:] # Last 8 chars of run ID + function_name = f"run_build_{image_hash}_{run_suffix}" execute_step, full_app_name = get_or_deploy_persistent_modal_app( app_name_base=app_name_base, 
zenml_image=zenml_image, execution_func=run_entire_pipeline, - function_name="run_entire_pipeline", + function_name=function_name, deployment=deployment, gpu_values=gpu_values, cpu_count=cpu_count, # Use ResourceSettings value or None (Modal default) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index b90a4c81be3..f206fcb8fcc 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -493,22 +493,11 @@ def get_or_deploy_persistent_modal_app( f"Found existing app '{app_name}' with matching build and fresh time window - reusing warm containers" ) - # Try to get the function directly - try: - existing_function = modal.Function.from_name( - app_name, - function_name, - environment_name=environment_name or "main", - ) - logger.debug( - "Successfully retrieved function from existing app" - ) - return existing_function, app_name - except Exception as func_error: - logger.warning( - f"Function lookup failed: {func_error}, redeploying" - ) - # Fall through to deployment + # For the app=pipeline, function=build architecture, we always redeploy + # to ensure fresh function deployment even if app exists + logger.info( + f"App exists but redeploying to ensure fresh function '{function_name}' deployment" + ) except Exception: # App not found or other lookup error - deploy fresh app From 3e7fbc59a29c849a06fffee32620b48cc25dcc30 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 23 Jun 2025 23:42:47 +0200 Subject: [PATCH 27/77] Refactor Modal orchestrator for better efficiency --- .../component-guide/orchestrators/modal.md | 78 +++++++++---------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/docs/book/component-guide/orchestrators/modal.md b/docs/book/component-guide/orchestrators/modal.md index c46022c94c5..3c84aabdbd1 100644 --- a/docs/book/component-guide/orchestrators/modal.md +++ b/docs/book/component-guide/orchestrators/modal.md @@ -6,7 +6,7 @@ description: Orchestrating 
your pipelines to run on Modal's serverless cloud pla Using the ZenML `modal` integration, you can orchestrate and scale your ML pipelines on [Modal's](https://modal.com/) serverless cloud platform with minimal setup and maximum efficiency. -The Modal orchestrator is designed for speed and cost-effectiveness, running entire pipelines in single serverless functions to minimize cold starts and optimize resource utilization. +The Modal orchestrator is designed for speed and cost-effectiveness, running entire pipelines using an intelligent app persistence strategy that reuses warm containers while ensuring proper isolation between different builds. {% hint style="warning" %} This component is only meant to be used within the context of a [remote ZenML deployment scenario](https://docs.zenml.io/getting-started/deploying-zenml/). Usage with a local ZenML deployment may lead to unexpected behavior! @@ -26,7 +26,7 @@ You should use the Modal orchestrator if: The Modal orchestrator may not be the best choice if: -* **You need fine-grained step isolation**: Modal runs entire pipelines in single functions by default, which means all steps share the same resources and environment. For pipelines requiring different resource configurations per step, consider the [Modal step operator](../step-operators/modal.md) instead. +* **You need fine-grained step isolation**: Modal runs entire pipelines in single functions, which means all steps share the same resources and environment. For pipelines requiring different resource configurations per step, consider the [Modal step operator](../step-operators/modal.md) instead. * **You have strict data locality requirements**: Modal runs in specific cloud regions and may not be suitable if you need to keep data processing within specific geographic boundaries or on-premises. 
@@ -133,8 +133,9 @@ The Modal orchestrator uses two types of settings following ZenML's standard pat - `gpu` - GPU type specification (e.g., "T4", "A100", "H100") - `region` - Cloud region preference - `cloud` - Cloud provider selection - - `execution_mode` - How to run the pipeline + - `modal_environment` - Modal environment name (e.g., "main", "dev", "prod") - `timeout`, `min_containers`, `max_containers` - Performance settings + - `synchronous` - Wait for completion (True) or fire-and-forget (False) {% hint style="info" %} **GPU Configuration**: Use `ResourceSettings.gpu_count` to specify how many GPUs you need, and `ModalOrchestratorSettings.gpu` to specify what type of GPU. Modal will combine these automatically (e.g., `gpu_count=2` + `gpu="A100"` becomes `"A100:2"`). @@ -155,10 +156,11 @@ modal_settings = ModalOrchestratorSettings( gpu="A100", # GPU type (optional) region="us-east-1", # Preferred region cloud="aws", # Cloud provider - execution_mode="pipeline", # or "per_step" + modal_environment="main", # Modal environment name timeout=3600, # 1 hour timeout min_containers=1, # Keep warm containers max_containers=10, # Scale up to 10 containers + synchronous=True, # Wait for completion ) # Configure hardware resources (quantities) @@ -225,28 +227,19 @@ def second_step(): ... ``` -### Execution modes +### App Persistence Architecture -The Modal orchestrator supports two execution modes: +The Modal orchestrator uses an intelligent app persistence strategy: -1. **`pipeline` (default)**: Runs the entire pipeline in a single Modal function for minimal overhead and cost efficiency. Steps execute sequentially with no cold starts or function call overhead between them. -2. **`per_step`**: Runs each step in a separate Modal function call for granular control and debugging. Better for pipelines where steps can run in parallel or have very different resource requirements. 
+- **Apps are persistent per pipeline**: Each pipeline gets its own Modal app that stays warm +- **Functions are unique per build/run**: Different builds get separate functions for proper isolation +- **Automatic reuse**: Same pipeline with same dependencies reuses warm apps +- **Smart isolation**: Different dependencies trigger new deployments for safety -{% hint style="info" %} -**Resource Sharing**: Both execution modes use the same Modal function with the same resource configuration (from pipeline-level settings). The difference is whether steps run sequentially in one function call (`pipeline`) or as separate function calls (`per_step`). -{% endhint %} - -```python -# Fast execution (default) - entire pipeline in one function -modal_settings = ModalOrchestratorSettings( - execution_mode="pipeline" -) - -# Granular execution - each step separate (useful for debugging) -modal_settings = ModalOrchestratorSettings( - execution_mode="per_step" -) -``` +This architecture provides the best of both worlds: +- **Performance**: Warm containers eliminate cold start delays +- **Isolation**: Different builds don't interfere with each other +- **Cost efficiency**: Apps are reused when safe to do so ### Base image requirements @@ -375,34 +368,39 @@ For production deployments, you can specify different Modal environments: ```python modal_settings = ModalOrchestratorSettings( - environment="production", # or "staging", "dev", etc. + modal_environment="production", # or "staging", "dev", etc. 
workspace="my-company" ) ``` -### How it works: Modal Apps and Functions +### How it works: App = Pipeline, Function = Build {% hint style="info" %} -**Implementation Details** +**Smart Architecture for Performance and Isolation** + +The ZenML Modal orchestrator uses an innovative "App = Pipeline, Function = Build" architecture: -The ZenML Modal orchestrator implements pipeline execution using Modal's app and function architecture: +**Pipeline-Level Apps**: +- Each pipeline gets its own persistent Modal app (e.g., `zenml-pipeline-training-pipeline`) +- Apps stay warm and reusable across multiple runs of the same pipeline +- App names are stable, enabling long-term container warmth -**Modal App Deployment**: -- ZenML creates a persistent Modal app with a unique name that includes a time window and build checksum -- The app stays deployed and available for a configurable time period (default: 2 hours) -- Apps are automatically reused within the time window if the Docker image hasn't changed -- This eliminates the overhead of redeploying apps for consecutive pipeline runs +**Build-Specific Functions**: +- Each unique build/dependency combination gets its own function within the app +- Function names include build hashes to ensure isolation (e.g., `run_build_abc123_def456`) +- Different dependencies = different functions = proper isolation +- Same dependencies = same function = maximum reuse -**Function Execution**: -- Your pipeline runs inside Modal functions within the deployed app -- In `pipeline` mode (default): The entire pipeline executes in a single function call -- In `per_step` mode: Each step runs as a separate function call within the same app -- Functions can maintain warm containers between executions for faster startup +**Execution Flow**: +- Your entire pipeline runs in a single function call using `PipelineEntrypoint` +- Maximum speed with minimal overhead +- Warm containers provide near-instant startup +- Fresh execution context prevents conflicts 
between runs **Container Management**: -- Modal manages container lifecycle automatically based on your `min_containers` and `max_containers` settings -- Warm containers stay ready with your Docker image loaded and dependencies installed -- Cold containers are spun up on-demand when warm containers are unavailable +- Modal manages container lifecycle based on your `min_containers` and `max_containers` settings +- Warm containers stay ready with your Docker image and dependencies loaded +- Apps persist across runs, functions are deployed fresh when needed {% endhint %} ### Warm containers for faster execution From 5f22bb0af5056717038f0986a3d724b60070f1ff Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 24 Jun 2025 08:30:53 +0200 Subject: [PATCH 28/77] Update environment variable naming convention in Modal step operator --- docs/book/component-guide/step-operators/modal.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/book/component-guide/step-operators/modal.md b/docs/book/component-guide/step-operators/modal.md index 8b7e307d905..1d0bd36fa0b 100644 --- a/docs/book/component-guide/step-operators/modal.md +++ b/docs/book/component-guide/step-operators/modal.md @@ -80,7 +80,7 @@ zenml step-operator register \ --token-id= \ --token-secret= \ --workspace= \ - --environment= + --modal-environment= zenml stack update -s ... 
``` @@ -115,7 +115,7 @@ The Modal step operator uses two types of settings following ZenML's standard pa - `gpu` - GPU type specification (e.g., "T4", "A100", "H100") - `region` - Cloud region preference - `cloud` - Cloud provider selection - - `environment` - Modal environment name + - `modal_environment` - Modal environment name - `timeout` - Maximum execution time in seconds {% hint style="info" %} @@ -134,7 +134,7 @@ modal_settings = ModalStepOperatorSettings( gpu="A100", # GPU type (optional) region="us-east-1", # Preferred region cloud="aws", # Cloud provider - environment="production", # Modal environment + modal_environment="production", # Modal environment timeout=3600, # 1 hour timeout ) @@ -189,7 +189,7 @@ For production deployments, you can specify different Modal environments and wor ```python modal_settings = ModalStepOperatorSettings( - environment="production", # or "staging", "dev", etc. + modal_environment="production", # or "staging", "dev", etc. workspace="my-company", # Modal workspace name gpu="A100", region="us-east-1", @@ -205,7 +205,7 @@ zenml step-operator register modal_prod \ --token-id= \ --token-secret= \ --workspace="production-workspace" \ - --environment="production" + --modal-environment="production" ``` ### Resource configuration notes From 376d7e47c836f8b004b1a4dbcb41058bb453b465 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 24 Jun 2025 08:50:12 +0200 Subject: [PATCH 29/77] Update Modal orchestrator to use sandbox architecture --- .../component-guide/orchestrators/modal.md | 137 +++----- .../modal/orchestrators/modal_orchestrator.py | 243 ++++++------- .../modal_orchestrator_entrypoint.py | 79 +++++ ...l_orchestrator_entrypoint_configuration.py | 84 +++++ src/zenml/integrations/modal/utils.py | 330 ------------------ 5 files changed, 323 insertions(+), 550 deletions(-) create mode 100644 src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py create mode 100644 
src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py diff --git a/docs/book/component-guide/orchestrators/modal.md b/docs/book/component-guide/orchestrators/modal.md index 3c84aabdbd1..716ccc1b079 100644 --- a/docs/book/component-guide/orchestrators/modal.md +++ b/docs/book/component-guide/orchestrators/modal.md @@ -6,7 +6,7 @@ description: Orchestrating your pipelines to run on Modal's serverless cloud pla Using the ZenML `modal` integration, you can orchestrate and scale your ML pipelines on [Modal's](https://modal.com/) serverless cloud platform with minimal setup and maximum efficiency. -The Modal orchestrator is designed for speed and cost-effectiveness, running entire pipelines using an intelligent app persistence strategy that reuses warm containers while ensuring proper isolation between different builds. +The Modal orchestrator is designed for speed and cost-effectiveness, running entire pipelines using Modal sandboxes with persistent app architecture for maximum flexibility and efficiency. {% hint style="warning" %} This component is only meant to be used within the context of a [remote ZenML deployment scenario](https://docs.zenml.io/getting-started/deploying-zenml/). Usage with a local ZenML deployment may lead to unexpected behavior! @@ -26,7 +26,7 @@ You should use the Modal orchestrator if: The Modal orchestrator may not be the best choice if: -* **You need fine-grained step isolation**: Modal runs entire pipelines in single functions, which means all steps share the same resources and environment. For pipelines requiring different resource configurations per step, consider the [Modal step operator](../step-operators/modal.md) instead. +* **You need fine-grained step isolation**: Modal orchestrator runs entire pipelines in single sandboxes, which means all steps share the same resources and environment. 
For pipelines requiring different resource configurations per step, consider the [Modal step operator](../step-operators/modal.md) instead. * **You have strict data locality requirements**: Modal runs in specific cloud regions and may not be suitable if you need to keep data processing within specific geographic boundaries or on-premises. @@ -227,19 +227,20 @@ def second_step(): ... ``` -### App Persistence Architecture +### Sandbox Architecture -The Modal orchestrator uses an intelligent app persistence strategy: +The Modal orchestrator uses a simplified sandbox-based architecture: -- **Apps are persistent per pipeline**: Each pipeline gets its own Modal app that stays warm -- **Functions are unique per build/run**: Different builds get separate functions for proper isolation -- **Automatic reuse**: Same pipeline with same dependencies reuses warm apps -- **Smart isolation**: Different dependencies trigger new deployments for safety +- **Persistent apps per pipeline**: Each pipeline gets its own Modal app that stays alive +- **Dynamic sandboxes for execution**: Each pipeline run creates a fresh sandbox for complete isolation +- **Built-in output streaming**: Modal automatically handles log streaming and output capture +- **Maximum flexibility**: Sandboxes can execute arbitrary commands and provide better isolation -This architecture provides the best of both worlds: -- **Performance**: Warm containers eliminate cold start delays -- **Isolation**: Different builds don't interfere with each other -- **Cost efficiency**: Apps are reused when safe to do so +This architecture provides optimal benefits: +- **Simplicity**: No complex app deployment or time window management +- **Flexibility**: Sandboxes offer more dynamic execution capabilities than functions +- **Isolation**: Each run gets a completely fresh execution environment +- **Performance**: Persistent apps eliminate deployment overhead ### Base image requirements @@ -373,44 +374,47 @@ modal_settings = 
ModalOrchestratorSettings( ) ``` -### How it works: App = Pipeline, Function = Build +### How it works: Persistent Apps + Dynamic Sandboxes {% hint style="info" %} -**Smart Architecture for Performance and Isolation** +**Simplified Architecture for Maximum Flexibility** -The ZenML Modal orchestrator uses an innovative "App = Pipeline, Function = Build" architecture: +The ZenML Modal orchestrator uses a streamlined "Persistent Apps + Dynamic Sandboxes" architecture: -**Pipeline-Level Apps**: +**Persistent Pipeline Apps**: - Each pipeline gets its own persistent Modal app (e.g., `zenml-pipeline-training-pipeline`) -- Apps stay warm and reusable across multiple runs of the same pipeline -- App names are stable, enabling long-term container warmth +- Apps stay alive across multiple runs using `modal.App.lookup(create_if_missing=True)` +- No complex time windows or deployment logic - truly persistent -**Build-Specific Functions**: -- Each unique build/dependency combination gets its own function within the app -- Function names include build hashes to ensure isolation (e.g., `run_build_abc123_def456`) -- Different dependencies = different functions = proper isolation -- Same dependencies = same function = maximum reuse +**Dynamic Execution Sandboxes**: +- Each pipeline run creates a fresh Modal sandbox for complete isolation +- Sandboxes execute arbitrary commands with maximum flexibility +- Built-in output streaming via `modal.enable_output()` +- Fresh execution environment prevents any conflicts between runs **Execution Flow**: -- Your entire pipeline runs in a single function call using `PipelineEntrypoint` -- Maximum speed with minimal overhead -- Warm containers provide near-instant startup -- Fresh execution context prevents conflicts between runs - -**Container Management**: -- Modal manages container lifecycle based on your `min_containers` and `max_containers` settings -- Warm containers stay ready with your Docker image and dependencies loaded -- Apps persist 
across runs, functions are deployed fresh when needed +- Your entire pipeline runs in a single sandbox using `PipelineEntrypoint` +- Simple app lookup or creation, then sandbox execution +- Automatic log streaming and output capture +- Complete isolation between different pipeline runs + +**Benefits**: +- **Simplicity**: No complex app deployment or reuse logic +- **Flexibility**: Sandboxes can execute any commands dynamically +- **Isolation**: Each run gets completely fresh execution context +- **Performance**: Persistent apps eliminate deployment overhead {% endhint %} -### Warm containers for faster execution +### Fast execution with persistent apps -Modal orchestrator uses persistent apps with warm containers to minimize cold starts: +Modal orchestrator uses persistent apps to minimize startup overhead: ```python modal_settings = ModalOrchestratorSettings( - min_containers=2, # Keep 2 containers warm - max_containers=20, # Scale up to 20 containers + region="us-east-1", # Preferred region + cloud="aws", # Cloud provider + modal_environment="main", # Modal environment + timeout=3600, # 1 hour timeout ) @pipeline( @@ -422,63 +426,32 @@ def my_pipeline(): ... ``` -This ensures your pipelines start executing immediately without waiting for container initialization. +This ensures your pipelines start executing quickly by reusing persistent apps and creating fresh sandboxes for isolation. 
{% hint style="warning" %} **Cost Implications and Optimization** -Understanding Modal orchestrator costs helps optimize your spend: +Understanding Modal orchestrator costs with sandbox architecture: -**Container Costs**: -- **Warm containers** (`min_containers > 0`): You pay for idle time even when pipelines aren't running -- **Cold containers**: Only pay when actually executing, but incur startup time (~30-60 seconds) -- **GPU containers**: Significantly more expensive than CPU-only containers for idle time +**Execution Costs**: +- **Pay-per-use**: You only pay when sandboxes are actively running +- **No idle costs**: Persistent apps don't incur costs when not executing +- **Sandbox overhead**: Minimal - sandboxes start quickly on persistent apps -**App Deployment Costs**: -- **App reuse**: No additional cost when reusing apps within the time window (default: 2 hours) -- **New deployments**: Small deployment overhead for each new app (new time window or changed Docker image) - -**Execution Mode Costs**: -- **Pipeline mode**: Most cost-effective - single function call for entire pipeline -- **Per-step mode**: Higher cost due to multiple function calls, but better for debugging +**Resource Optimization**: +- **GPU usage**: Only allocated during actual pipeline execution +- **Memory and CPU**: Charged only for sandbox execution time +- **Storage**: Docker images are cached across runs on persistent apps **Cost Optimization Strategies**: -- **Development**: Use `min_containers=0` to avoid idle costs -- **Production (frequent)**: Use `min_containers=1-2` for pipelines running multiple times per hour -- **Production (infrequent)**: Use `min_containers=0` for pipelines running less than once per hour -- **GPU workloads**: Be especially careful with `min_containers` due to high GPU idle costs -- **Time windows**: Adjust `app_warming_window_hours` based on your pipeline frequency +- **Efficient pipelines**: Optimize pipeline execution time to reduce costs +- **Right-size 
resources**: Use appropriate CPU/memory/GPU for your workload +- **Regional selection**: Choose regions close to your data sources +- **Timeout management**: Set appropriate timeouts to avoid runaway costs -Monitor your Modal dashboard to track container utilization and costs, then adjust settings accordingly. +Monitor your Modal dashboard to track sandbox execution time and resource usage for cost optimization. {% endhint %} -### App reuse and warming windows - -You can control how long Modal apps stay deployed and available for reuse: - -```python -modal_settings = ModalOrchestratorSettings( - app_warming_window_hours=4.0, # Keep apps deployed for 4 hours - min_containers=1, # Keep 1 container warm - max_containers=5 # Scale up to 5 containers -) - -@pipeline(settings={"orchestrator": modal_settings}) -def my_pipeline(): - # This pipeline will reuse the same Modal app if run within 4 hours - # and the Docker image hasn't changed - ... -``` - -**App Reuse Benefits**: -- **Faster execution**: No app deployment time for subsequent runs -- **Cost efficiency**: No repeated deployment overhead -- **Consistent environment**: Same app instance for related pipeline runs - -**When apps are recreated**: -- After the warming window expires (default: 2 hours) -- When the Docker image changes (new dependencies, code changes) -- When resource requirements change significantly ## Best practices diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 4078c14ea43..b55248d2835 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -13,43 +13,40 @@ # permissions and limitations under the License. 
"""Implementation of a Modal orchestrator.""" -import hashlib +import asyncio import os -import time -import traceback from typing import ( TYPE_CHECKING, Any, Dict, Iterator, + List, Optional, - Tuple, Type, cast, ) from uuid import uuid4 +import modal + from zenml.config.base_settings import BaseSettings from zenml.config.constants import RESOURCE_SETTINGS_KEY -from zenml.entrypoints.pipeline_entrypoint_configuration import ( - PipelineEntrypointConfiguration, +from zenml.integrations.modal.orchestrators.modal_orchestrator_entrypoint_configuration import ( + ModalOrchestratorEntrypointConfiguration, ) from zenml.integrations.modal.utils import ( ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, build_modal_image, create_modal_stack_validator, get_gpu_values, - get_or_deploy_persistent_modal_app, get_resource_settings_from_deployment, get_resource_values, setup_modal_client, - stream_modal_logs_and_wait, ) from zenml.logger import get_logger from zenml.metadata.metadata_types import MetadataType from zenml.orchestrators import ContainerizedOrchestrator from zenml.stack import Stack, StackValidator -from zenml.utils import string_utils if TYPE_CHECKING: from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( @@ -61,53 +58,11 @@ logger = get_logger(__name__) -def run_entire_pipeline( - deployment_id: str, - orchestrator_run_id: str, -) -> None: - """Execute entire pipeline using PipelineEntrypointConfiguration for maximum efficiency. - - Args: - deployment_id: ID of the pipeline deployment. - orchestrator_run_id: ID of the orchestrator run. - - Raises: - Exception: If pipeline execution fails. 
- """ - # Set the orchestrator run ID in the Modal environment - os.environ[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id - - try: - logger.debug("Initializing pipeline entrypoint configuration") - logger.debug(f"Deployment ID: {deployment_id}") - logger.debug(f"Orchestrator Run ID: {orchestrator_run_id}") - - # Create the entrypoint arguments - args = PipelineEntrypointConfiguration.get_entrypoint_arguments( - deployment_id=deployment_id - ) - - logger.debug("Creating pipeline configuration") - config = PipelineEntrypointConfiguration(arguments=args) - - logger.info("Executing entire pipeline") - config.run() - - logger.info("Entire pipeline completed successfully") - - except Exception as e: - error_details = traceback.format_exc() - logger.error(f"Error executing pipeline: {e}") - logger.debug(f"Full traceback:\n{error_details}") - raise - - class ModalOrchestrator(ContainerizedOrchestrator): """Orchestrator responsible for running entire pipelines on Modal. - This orchestrator runs complete pipelines in a single Modal function - for maximum speed and efficiency, avoiding the overhead of multiple - step executions. + This orchestrator runs complete pipelines using Modal sandboxes + for maximum flexibility and efficiency with persistent app architecture. 
""" @property @@ -251,104 +206,116 @@ def prepare_or_run_pipeline( gpu_values = get_gpu_values(settings.gpu, resource_settings) cpu_count, memory_mb = get_resource_values(resource_settings) - start_time = time.time() - - # Execute steps using Modal's fast container spin-up with persistent app - logger.info( - "Starting pipeline execution with persistent serverless functions" - ) + # Execute pipeline using Modal sandboxes for maximum flexibility + logger.info("Starting pipeline execution with Modal sandboxes") - # NEW ARCHITECTURE: App = Pipeline (persistent), Function = Build (isolated) - # App stays warm per pipeline, functions handle different builds within the app + # SANDBOX ARCHITECTURE: Simple persistent app per pipeline pipeline_name = deployment.pipeline_configuration.name.replace( "_", "-" ) - app_name_base = f"zenml-pipeline-{pipeline_name}" + app_name = f"zenml-pipeline-{pipeline_name}" - # Function name based on build + run for complete isolation - image_name = self.get_image(deployment=deployment) - image_hash = hashlib.md5(image_name.encode()).hexdigest()[:8] - run_suffix = orchestrator_run_id[-8:] # Last 8 chars of run ID - function_name = f"run_build_{image_hash}_{run_suffix}" - - execute_step, full_app_name = get_or_deploy_persistent_modal_app( - app_name_base=app_name_base, - zenml_image=zenml_image, - execution_func=run_entire_pipeline, - function_name=function_name, - deployment=deployment, - gpu_values=gpu_values, - cpu_count=cpu_count, # Use ResourceSettings value or None (Modal default) - memory_mb=memory_mb, # Use ResourceSettings value or None (Modal default) - cloud=settings.cloud or self.config.cloud, - region=settings.region or self.config.region, - timeout=settings.timeout or self.config.timeout, - min_containers=settings.min_containers - or self.config.min_containers, - max_containers=settings.max_containers - or self.config.max_containers, - environment_name=settings.modal_environment - or self.config.modal_environment, # Use 
modal_environment from config/settings - app_warming_window_hours=settings.app_warming_window_hours - or self.config.app_warming_window_hours, - ) - - logger.info( - "Executing with deployed serverless application and warm containers" + # Build entrypoint command and args for the orchestrator sandbox + command = ( + ModalOrchestratorEntrypointConfiguration.get_entrypoint_command() ) - - # Execute based on execution mode with improved Modal Function API usage - sync_execution = ( - settings.synchronous - if hasattr(settings, "synchronous") - else self.config.synchronous + args = ( + ModalOrchestratorEntrypointConfiguration.get_entrypoint_arguments( + deployment_id=deployment.id, + orchestrator_run_id=orchestrator_run_id, + run_id=placeholder_run.id if placeholder_run else None, + ) ) + entrypoint_command = command + args - def execute_modal_function( - func_args: Tuple[Any, ...], description: str - ) -> Any: - """Execute Modal function with proper sync/async control and log streaming. - - Args: - func_args: Arguments to pass to the Modal function. - description: Description of the operation for logging. - - Returns: - Result of the Modal function execution. 
- """ - # Always use .spawn() to get a FunctionCall object for log streaming - function_call = execute_step.spawn(*func_args) - - if sync_execution: - logger.debug("Using synchronous execution with log streaming") - # Stream logs while waiting for completion using app name - return stream_modal_logs_and_wait( - function_call, description, full_app_name - ) - else: - logger.debug("Using asynchronous fire-and-forget execution") - logger.info( - f"{description} started asynchronously (not waiting for completion)" - ) - return function_call - - # Execute entire pipeline in one function + # Execute using sandbox try: - execute_modal_function( - (deployment.id, orchestrator_run_id), - "Pipeline execution", + asyncio.run( + self._execute_pipeline_sandbox( + app_name=app_name, + zenml_image=zenml_image, + entrypoint_command=entrypoint_command, + gpu_values=gpu_values, + cpu_count=cpu_count, + memory_mb=memory_mb, + cloud=settings.cloud or self.config.cloud, + region=settings.region or self.config.region, + timeout=settings.timeout or self.config.timeout, + environment_name=settings.modal_environment + or self.config.modal_environment, + synchronous=settings.synchronous + if hasattr(settings, "synchronous") + else self.config.synchronous, + ) ) except Exception as e: - logger.error(f"Pipeline failed: {e}") - logger.info("Check platform dashboard for detailed logs") + logger.error(f"Pipeline execution failed: {e}") + logger.info("Check Modal dashboard for detailed logs") raise - run_duration = time.time() - start_time + logger.info("Pipeline execution completed successfully") + + return None + + async def _execute_pipeline_sandbox( + self, + app_name: str, + zenml_image: Any, + entrypoint_command: List[str], + gpu_values: Optional[str] = None, + cpu_count: Optional[int] = None, + memory_mb: Optional[int] = None, + cloud: Optional[str] = None, + region: Optional[str] = None, + timeout: int = 86400, + environment_name: Optional[str] = None, + synchronous: bool = True, + ) -> 
None: + """Execute pipeline using Modal sandbox. - # Log completion - logger.info( - "Pipeline run has finished in `%s`.", - string_utils.get_human_readable_time(run_duration), + Args: + app_name: Name of the Modal app + zenml_image: Pre-built ZenML Docker image for Modal + entrypoint_command: Command to execute in the sandbox + gpu_values: GPU configuration string + cpu_count: Number of CPU cores + memory_mb: Memory allocation in MB + cloud: Cloud provider to use + region: Region to deploy in + timeout: Maximum execution timeout + environment_name: Modal environment name + synchronous: Whether to wait for completion + """ + # Create persistent app (will reuse if exists) + app = modal.App.lookup( + app_name, create_if_missing=True, environment_name=environment_name ) - return None + logger.info(f"Using Modal app: {app_name}") + + logger.info("Creating sandbox for pipeline execution") + + with modal.enable_output(): + # Create sandbox with the entrypoint command + sb = await modal.Sandbox.create.aio( + *entrypoint_command, # Pass as separate arguments to avoid shell quoting issues + image=zenml_image, + gpu=gpu_values, + cpu=cpu_count, + memory=memory_mb, + cloud=cloud, + region=region, + app=app, + timeout=timeout, + ) + + logger.info("Sandbox created, executing pipeline...") + + if synchronous: + # Wait for completion and stream output + await sb.wait.aio() + logger.info("Pipeline execution completed") + else: + logger.info( + "Pipeline started asynchronously (not waiting for completion)" + ) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py new file mode 100644 index 00000000000..86c25fb518e --- /dev/null +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -0,0 +1,79 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Entrypoint of the Modal orchestrator sandbox.""" + +import argparse +import os + +from zenml.entrypoints.pipeline_entrypoint_configuration import ( + PipelineEntrypointConfiguration, +) +from zenml.integrations.modal.utils import ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +def parse_args() -> argparse.Namespace: + """Parse entrypoint arguments. + + Returns: + Parsed args. 
+ """ + parser = argparse.ArgumentParser() + parser.add_argument("--deployment_id", type=str, required=True) + parser.add_argument("--orchestrator_run_id", type=str, required=True) + parser.add_argument("--run_id", type=str, required=False) + return parser.parse_args() + + +def main() -> None: + """Entrypoint of the Modal orchestrator sandbox.""" + # Log to the container's stdout so it can be streamed by Modal + logger.info("Modal orchestrator sandbox started.") + + # Parse arguments + args = parse_args() + + # Set the orchestrator run ID in the environment + os.environ[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = args.orchestrator_run_id + + logger.info(f"Deployment ID: {args.deployment_id}") + logger.info(f"Orchestrator Run ID: {args.orchestrator_run_id}") + if args.run_id: + logger.info(f"Pipeline Run ID: {args.run_id}") + + try: + # Create the entrypoint arguments for pipeline execution + entrypoint_args = ( + PipelineEntrypointConfiguration.get_entrypoint_arguments( + deployment_id=args.deployment_id + ) + ) + + logger.info("Creating pipeline configuration") + config = PipelineEntrypointConfiguration(arguments=entrypoint_args) + + logger.info("Executing entire pipeline") + config.run() + + logger.info("Pipeline execution completed successfully") + + except Exception as e: + logger.error(f"Pipeline execution failed: {e}") + raise + + +if __name__ == "__main__": + main() diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py new file mode 100644 index 00000000000..ae4398bb789 --- /dev/null +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py @@ -0,0 +1,84 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Entrypoint configuration for the Modal orchestrator sandbox.""" + +from typing import TYPE_CHECKING, List, Optional, Set + +if TYPE_CHECKING: + from uuid import UUID + +DEPLOYMENT_ID_OPTION = "deployment_id" +ORCHESTRATOR_RUN_ID_OPTION = "orchestrator_run_id" +RUN_ID_OPTION = "run_id" + + +class ModalOrchestratorEntrypointConfiguration: + """Entrypoint configuration for the Modal orchestrator sandbox.""" + + @classmethod + def get_entrypoint_options(cls) -> Set[str]: + """Gets all the options required for running this entrypoint. + + Returns: + Entrypoint options. + """ + options = { + DEPLOYMENT_ID_OPTION, + ORCHESTRATOR_RUN_ID_OPTION, + } + return options + + @classmethod + def get_entrypoint_command(cls) -> List[str]: + """Returns a command that runs the entrypoint module. + + Returns: + Entrypoint command. + """ + command = [ + "python", + "-m", + "zenml.integrations.modal.orchestrators.modal_orchestrator_entrypoint", + ] + return command + + @classmethod + def get_entrypoint_arguments( + cls, + deployment_id: "UUID", + orchestrator_run_id: str, + run_id: Optional["UUID"] = None, + ) -> List[str]: + """Gets all arguments that the entrypoint command should be called with. + + Args: + deployment_id: ID of the deployment. + orchestrator_run_id: ID of the orchestrator run. + run_id: Optional ID of the pipeline run. + + Returns: + List of entrypoint arguments. 
+ """ + args = [ + f"--{DEPLOYMENT_ID_OPTION}", + str(deployment_id), + f"--{ORCHESTRATOR_RUN_ID_OPTION}", + orchestrator_run_id, + ] + + if run_id: + args.append(f"--{RUN_ID_OPTION}") + args.append(str(run_id)) + + return args diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index f206fcb8fcc..7e14633144f 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -14,9 +14,6 @@ """Shared utilities for Modal integration components.""" import os -import subprocess -import threading -import time from typing import Any, Dict, List, Optional, Tuple, Union try: @@ -36,101 +33,6 @@ ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID = "ZENML_MODAL_ORCHESTRATOR_RUN_ID" -class ModalLogStreamer: - """Stream logs from Modal CLI in a separate thread.""" - - def __init__(self, app_name: str, call_id: Optional[str], logger: Any): - """Initialize the log streamer. - - Args: - app_name: Name of the Modal app to stream logs from. - call_id: Optional function call ID for filtering logs. - logger: Logger instance to use for output. 
- """ - self.app_name = app_name - self.call_id = call_id - self.logger = logger - self.log_stream_active = threading.Event() - self.log_thread: Optional[threading.Thread] = None - - def start(self) -> None: - """Start log streaming in a background thread.""" - self.log_stream_active.set() - self.log_thread = threading.Thread( - target=self._stream_logs, daemon=True - ) - self.log_thread.start() - - def stop(self) -> None: - """Stop log streaming.""" - self.log_stream_active.clear() - if self.log_thread: - # Give the log thread a moment to clean up - time.sleep(0.5) - - def _stream_logs(self) -> None: - """Stream logs from Modal CLI.""" - try: - # Use modal CLI to stream logs (automatically streams while app is active) - cmd = [ - "modal", - "app", - "logs", - self.app_name, - "--timestamps", - ] - self.logger.debug(f"Starting log stream: {' '.join(cmd)}") - with subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, # Line buffered - universal_newlines=True, - ) as process: - while ( - self.log_stream_active.is_set() and process.poll() is None - ): - if process.stdout: - line = process.stdout.readline() - if line: - # Clean up the log line and forward to our logger - log_msg = line.strip() - - # Skip empty lines and "No logs" messages - if not log_msg or log_msg.startswith("No logs"): - continue - - # Filter logs based on function call ID when available - # If no call ID, show all logs (Modal CLI handles recency) - if self.call_id and self.call_id in log_msg: - # This log definitely belongs to our execution - self.logger.info(f"{log_msg}") - elif not self.call_id: - # No call ID available, show all logs from Modal CLI stream - # Modal CLI already filters for recent/relevant logs - self.logger.info(f"{log_msg}") - # Else: skip logs that don't match our call ID - - else: - break - - # Clean up process - if process.poll() is None: - process.terminate() - try: - process.wait(timeout=5) - except 
subprocess.TimeoutExpired: - process.kill() - - except FileNotFoundError: - self.logger.warning( - "Modal CLI not found. Install with: pip install modal" - ) - except Exception as e: - self.logger.debug(f"Log streaming error: {e}") - - class ModalAuthenticationError(Exception): """Exception raised for Modal authentication issues with helpful guidance.""" @@ -385,164 +287,6 @@ def _validate_remote_components(stack: "Stack") -> Tuple[bool, str]: ) -def get_or_deploy_persistent_modal_app( - app_name_base: str, - zenml_image: Any, - execution_func: Any, - function_name: str, - deployment: Any, - gpu_values: Optional[str] = None, - cpu_count: Optional[int] = None, - memory_mb: Optional[int] = None, - cloud: Optional[str] = None, - region: Optional[str] = None, - timeout: int = 86400, - min_containers: Optional[int] = None, - max_containers: Optional[int] = None, - environment_name: Optional[str] = None, - app_warming_window_hours: float = 2.0, -) -> Tuple[Any, str]: - """Get or deploy a persistent Modal app with warm containers. - - This function deploys a Modal app that stays alive with warm containers - for maximum speed between runs. The app name includes both time window - and build checksum to ensure fresh deployments only when builds actually change. - - Args: - app_name_base: Base name for the app (will be suffixed with timestamp and build hash). - zenml_image: Pre-built ZenML Docker image for Modal. - execution_func: The function to execute in the Modal app. - function_name: Name of the function in the app. - deployment: The pipeline deployment containing build information. - gpu_values: GPU configuration string. - cpu_count: Number of CPU cores. - memory_mb: Memory allocation in MB. - cloud: Cloud provider to use. - region: Region to deploy in. - timeout: Maximum execution timeout. - min_containers: Minimum containers to keep warm. - max_containers: Maximum containers to scale to. - environment_name: Modal environment name. 
- app_warming_window_hours: Hours for app name window to enable reuse. - - Returns: - Tuple of (Modal function ready for execution, full app name). - - Raises: - ModalAuthenticationError: If platform authentication fails. - """ - # Create timestamp window for app reuse (rounds down to nearest window boundary) - current_time = int(time.time()) - window_seconds = int( - app_warming_window_hours * 3600 - ) # Convert hours to seconds - time_window = current_time // window_seconds - - # Generate build identifier to ensure fresh deployments only when builds actually change - # Use deployment build checksum which only changes when Docker settings, requirements, etc. change - build_hash = "no-build" - if deployment.build and deployment.build.checksum: - # Use first 8 characters of build checksum for compact identifier - build_hash = deployment.build.checksum[:8] - logger.debug(f"Using build checksum: {deployment.build.checksum}") - else: - logger.warning( - "No build checksum available, using fallback identifier" - ) - - # Include both time window and build hash in app name - app_name = f"{app_name_base}-{time_window}-{build_hash}" - - logger.info( - f"Getting/deploying persistent serverless application: {app_name}" - ) - logger.debug( - f"App name includes time window: {time_window}, build hash: {build_hash}" - ) - - # Create the app - app = modal.App(app_name) - - # Ensure we have minimum containers for fast startup - effective_min_containers = min_containers or 1 - effective_max_containers = max_containers or 10 - - execute_step_func = app.function( - image=zenml_image, - gpu=gpu_values, - cpu=cpu_count, - memory=memory_mb, - cloud=cloud, - region=region, - timeout=timeout, - min_containers=effective_min_containers, # Keep containers warm for speed - max_containers=effective_max_containers, # Allow scaling - )(execution_func) - - # Try to lookup existing app with matching time window and image, deploy if not found - try: - logger.debug( - f"Checking for serverless 
application with time window {time_window} and build hash {build_hash}: {app_name}" - ) - - try: - modal.App.lookup( - app_name, environment_name=environment_name or "main" - ) - logger.info( - f"Found existing app '{app_name}' with matching build and fresh time window - reusing warm containers" - ) - - # For the app=pipeline, function=build architecture, we always redeploy - # to ensure fresh function deployment even if app exists - logger.info( - f"App exists but redeploying to ensure fresh function '{function_name}' deployment" - ) - - except Exception: - # App not found or other lookup error - deploy fresh app - logger.debug( - "No app found for current time window and build hash, deploying fresh app" - ) - - # Deploy the app with better error handling - try: - app.deploy( - name=app_name, environment_name=environment_name or "main" - ) - logger.info( - f"App '{app_name}' deployed with {effective_min_containers} warm containers" - ) - except Exception as deploy_error: - error_message = str(deploy_error) - if ( - "Token ID is malformed" in error_message - or "UNAUTHENTICATED" in error_message - ): - raise ModalAuthenticationError( - "Platform authentication failed. 
Token ID or secret is invalid.", - suggestions=[ - "Check that token_id starts with 'ak-' and token_secret starts with 'as-'", - "Get new tokens from the platform dashboard", - "Or run 'modal token new' to set up ~/.modal.toml authentication", - "Ensure both token_id AND token_secret are provided in orchestrator config", - ], - ) from deploy_error - else: - # Re-raise other deployment errors as-is - raise - - except Exception as e: - logger.error(f"Deployment failed: {e}") - raise - - logger.info( - f"Serverless application configured with min_containers={effective_min_containers}, max_containers={effective_max_containers}" - ) - - return execute_step_func, app_name - - def get_resource_settings_from_deployment( deployment: Any, resource_settings_key: str = "resources", @@ -607,77 +351,3 @@ def get_resource_settings_from_deployment( resource_settings = ResourceSettings() # Default empty settings return resource_settings - - -def stream_modal_logs_and_wait( - function_call: Any, - description: str, - app_name: str, - check_interval: float = 2.0, -) -> Any: - """Stream logs from Modal app using CLI and wait for FunctionCall completion. - - Args: - function_call: The Modal FunctionCall object from .spawn() - description: Description of the operation for logging. - app_name: Name of the Modal app to stream logs from. - check_interval: How often to check for completion (seconds). - - Returns: - The result of the function execution. - - Raises: - Exception: If the Modal function execution fails. - KeyboardInterrupt: If the user cancels the execution. 
- """ - logger.info(f"Starting {description}") - - # Get function call ID for filtering (if available) - call_id = None - try: - call_id = getattr(function_call, "object_id", None) - if call_id: - logger.debug(f"Function call ID: {call_id}") - except Exception: - pass - - # Wait a moment for the function to start before beginning log streaming - # This helps avoid capturing old logs from previous runs - time.sleep(1) - - # Create and start log streaming - log_streamer = ModalLogStreamer(app_name, call_id, logger) - log_streamer.start() - - try: - # Poll for function completion - start_time = time.time() - while True: - try: - # Try to get result with timeout=0 (non-blocking) - result = function_call.get(timeout=0) - elapsed = time.time() - start_time - logger.info( - f"{description} completed successfully after {elapsed:.1f}s" - ) - return result - except TimeoutError: - # Function still running, continue waiting - time.sleep(check_interval) - except Exception as e: - # Function failed - elapsed = time.time() - start_time - logger.error(f"{description} failed after {elapsed:.1f}s: {e}") - raise - - except KeyboardInterrupt: - logger.info(f"Cancelling {description}") - try: - function_call.cancel() - logger.info("Function cancelled successfully") - except Exception as cancel_error: - logger.warning(f"Could not cancel function: {cancel_error}") - raise - finally: - # Stop log streaming - log_streamer.stop() From 04d0204c7b4d4ce9963a9b2610c828bd677705d2 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 24 Jun 2025 09:05:08 +0200 Subject: [PATCH 30/77] Update Modal orchestrator for per-step sandboxes --- .../flavors/modal_orchestrator_flavor.py | 26 +- .../modal/orchestrators/modal_orchestrator.py | 76 +++-- .../modal_orchestrator_entrypoint.py | 303 ++++++++++++++++-- 3 files changed, 351 insertions(+), 54 deletions(-) diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py 
index 808d1537d12..7b40b942369 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -30,9 +30,9 @@ class ModalExecutionMode(str, Enum): """Execution modes for the Modal orchestrator. Attributes: - PIPELINE: Execute entire pipeline in one Modal function (fastest, default). - PER_STEP: Execute each step in a separate Modal function (granular - control). + PIPELINE: Execute entire pipeline in one Modal sandbox (fastest, default). + PER_STEP: Execute each step in a separate Modal sandbox (granular control, + better for debugging, allows step-specific resources). """ PIPELINE = "pipeline" @@ -44,18 +44,14 @@ class ModalOrchestratorSettings(BaseSettings): Attributes: gpu: The type of GPU to use for the pipeline execution (e.g., "T4", - "A100"). - Use ResourceSettings.gpu_count to specify the number of GPUs. + "A100"). Use ResourceSettings.gpu_count to specify the number of GPUs. region: The region to use for the pipeline execution. cloud: The cloud provider to use for the pipeline execution. modal_environment: The Modal environment to use for the pipeline execution. timeout: Maximum execution time in seconds (default 24h). - min_containers: Minimum containers to keep warm (replaces keep_warm). - max_containers: Maximum concurrent containers (replaces concurrency_limit). - execution_mode: Execution mode - PIPELINE (default, fastest) or PER_STEP (granular control). + execution_mode: Execution mode - PIPELINE (fastest) or PER_STEP (granular). + max_parallelism: Maximum number of parallel sandboxes (for PER_STEP mode). synchronous: Wait for completion (True) or fire-and-forget (False). - app_warming_window_hours: Hours for app name window to enable container reuse. - Apps are reused within this time window for efficiency. Default 2 hours. 
""" gpu: Optional[str] = None @@ -63,19 +59,15 @@ class ModalOrchestratorSettings(BaseSettings): cloud: Optional[str] = None modal_environment: Optional[str] = None timeout: int = 86400 # 24 hours (Modal's maximum) - min_containers: Optional[int] = ( - 1 # Keep 1 container warm for sequential execution - ) - max_containers: Optional[int] = 10 # Allow up to 10 concurrent containers execution_mode: ModalExecutionMode = ( ModalExecutionMode.PIPELINE ) # Default to fastest mode + max_parallelism: Optional[int] = ( + None # Maximum number of parallel sandboxes (for PER_STEP mode) + ) synchronous: bool = ( True # Wait for completion (True) or fire-and-forget (False) ) - app_warming_window_hours: float = ( - 2.0 # Default 2-hour window for app reuse - ) class ModalOrchestratorConfig( diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index b55248d2835..2ae27f6f0d6 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -31,6 +31,9 @@ from zenml.config.base_settings import BaseSettings from zenml.config.constants import RESOURCE_SETTINGS_KEY +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalExecutionMode, +) from zenml.integrations.modal.orchestrators.modal_orchestrator_entrypoint_configuration import ( ModalOrchestratorEntrypointConfiguration, ) @@ -194,6 +197,12 @@ def prepare_or_run_pipeline( "ModalOrchestratorSettings", self.get_settings(deployment) ) + # Check execution mode + execution_mode = getattr( + settings, "execution_mode", ModalExecutionMode.PIPELINE + ) + logger.info(f"Using execution mode: {execution_mode}") + # Get resource settings from pipeline configuration resource_settings = get_resource_settings_from_deployment( deployment, RESOURCE_SETTINGS_KEY @@ -228,26 +237,49 @@ def prepare_or_run_pipeline( ) entrypoint_command = command + args 
- # Execute using sandbox + # Execute using sandbox based on execution mode try: - asyncio.run( - self._execute_pipeline_sandbox( - app_name=app_name, - zenml_image=zenml_image, - entrypoint_command=entrypoint_command, - gpu_values=gpu_values, - cpu_count=cpu_count, - memory_mb=memory_mb, - cloud=settings.cloud or self.config.cloud, - region=settings.region or self.config.region, - timeout=settings.timeout or self.config.timeout, - environment_name=settings.modal_environment - or self.config.modal_environment, - synchronous=settings.synchronous - if hasattr(settings, "synchronous") - else self.config.synchronous, + if execution_mode == ModalExecutionMode.PIPELINE: + # Execute entire pipeline in one sandbox (fastest) + asyncio.run( + self._execute_pipeline_sandbox( + app_name=app_name, + zenml_image=zenml_image, + entrypoint_command=entrypoint_command, + gpu_values=gpu_values, + cpu_count=cpu_count, + memory_mb=memory_mb, + cloud=settings.cloud or self.config.cloud, + region=settings.region or self.config.region, + timeout=settings.timeout or self.config.timeout, + environment_name=settings.modal_environment + or self.config.modal_environment, + synchronous=settings.synchronous + if hasattr(settings, "synchronous") + else self.config.synchronous, + ) + ) + else: + # PER_STEP mode: Execute each step in separate sandbox + # This is handled by the entrypoint using ThreadedDagRunner + asyncio.run( + self._execute_pipeline_sandbox( + app_name=app_name, + zenml_image=zenml_image, + entrypoint_command=entrypoint_command, + gpu_values=gpu_values, + cpu_count=cpu_count, + memory_mb=memory_mb, + cloud=settings.cloud or self.config.cloud, + region=settings.region or self.config.region, + timeout=settings.timeout or self.config.timeout, + environment_name=settings.modal_environment + or self.config.modal_environment, + synchronous=settings.synchronous + if hasattr(settings, "synchronous") + else self.config.synchronous, + ) ) - ) except Exception as e: logger.error(f"Pipeline 
execution failed: {e}") logger.info("Check Modal dashboard for detailed logs") @@ -312,7 +344,13 @@ async def _execute_pipeline_sandbox( logger.info("Sandbox created, executing pipeline...") if synchronous: - # Wait for completion and stream output + # Stream output while waiting for completion + logger.info("Streaming pipeline execution logs...") + async for line in sb.stdout.aio(): + # Stream logs to stdout with proper formatting + print(line, end="") + + # Ensure completion await sb.wait.aio() logger.info("Pipeline execution completed") else: diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index 86c25fb518e..45f7bc6eab5 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -14,13 +14,41 @@ """Entrypoint of the Modal orchestrator sandbox.""" import argparse +import asyncio import os +from typing import Any, Dict, cast +from uuid import UUID +import modal + +from zenml.client import Client +from zenml.config.constants import RESOURCE_SETTINGS_KEY from zenml.entrypoints.pipeline_entrypoint_configuration import ( PipelineEntrypointConfiguration, ) -from zenml.integrations.modal.utils import ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID +from zenml.entrypoints.step_entrypoint_configuration import ( + StepEntrypointConfiguration, +) +from zenml.enums import ExecutionStatus +from zenml.exceptions import AuthorizationException +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalExecutionMode, + ModalOrchestratorSettings, +) +from zenml.integrations.modal.orchestrators.modal_orchestrator import ( + ModalOrchestrator, +) +from zenml.integrations.modal.utils import ( + ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, + build_modal_image, + get_gpu_values, + get_resource_values, + setup_modal_client, +) from zenml.logger import 
get_logger +from zenml.orchestrators import publish_utils +from zenml.orchestrators.dag_runner import NodeStatus, ThreadedDagRunner +from zenml.orchestrators.utils import get_config_environment_vars logger = get_logger(__name__) @@ -38,35 +66,274 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() +def run_step_on_modal( + step_name: str, + deployment: Any, + settings: ModalOrchestratorSettings, + environment: Dict[str, str], +) -> None: + """Run a pipeline step in a separate Modal sandbox. + + Args: + step_name: Name of the step. + deployment: The deployment configuration. + settings: Modal orchestrator settings. + environment: Environment variables. + + Raises: + Exception: If the sandbox fails to execute. + """ + logger.info(f"Running step '{step_name}' in Modal sandbox") + + # Get step-specific settings if any + step_config = deployment.step_configurations[step_name].config + step_settings = step_config.settings.get("orchestrator.modal", None) + if step_settings: + step_modal_settings = ModalOrchestratorSettings.model_validate( + step_settings.model_dump() if step_settings else {} + ) + # Merge with pipeline-level settings + for key, value in step_modal_settings.model_dump( + exclude_unset=True + ).items(): + if value is not None: + setattr(settings, key, value) + + # Get resource settings for this step + resource_settings = step_config.settings.get(RESOURCE_SETTINGS_KEY) + + # Build Modal image for this step + client = Client() + active_stack = client.active_stack + image_name = ModalOrchestrator.get_image( + deployment=deployment, step_name=step_name + ) + zenml_image = build_modal_image(image_name, active_stack, environment) + + # Configure resources + gpu_values = get_gpu_values(settings.gpu, resource_settings) + cpu_count, memory_mb = get_resource_values(resource_settings) + + # Create step entrypoint command + step_command = StepEntrypointConfiguration.get_entrypoint_command() + step_args = 
StepEntrypointConfiguration.get_entrypoint_arguments( + step_name=step_name, deployment_id=deployment.id + ) + entrypoint_command = step_command + step_args + + # Create app name for this step + pipeline_name = deployment.pipeline_configuration.name.replace("_", "-") + app_name = f"zenml-pipeline-{pipeline_name}" + + # Execute step in sandbox + try: + asyncio.run( + _execute_step_sandbox( + app_name=app_name, + step_name=step_name, + zenml_image=zenml_image, + entrypoint_command=entrypoint_command, + gpu_values=gpu_values, + cpu_count=cpu_count, + memory_mb=memory_mb, + cloud=settings.cloud, + region=settings.region, + timeout=settings.timeout, + environment_name=settings.modal_environment, + ) + ) + logger.info(f"Step {step_name} completed successfully") + except Exception as e: + logger.error(f"Step {step_name} failed: {e}") + raise + + +async def _execute_step_sandbox( + app_name: str, + step_name: str, + zenml_image: Any, + entrypoint_command: list, + gpu_values: str = None, + cpu_count: int = None, + memory_mb: int = None, + cloud: str = None, + region: str = None, + timeout: int = 86400, + environment_name: str = None, +) -> None: + """Execute a single step using Modal sandbox. 
+ + Args: + app_name: Name of the Modal app + step_name: Name of the step + zenml_image: Pre-built ZenML Docker image for Modal + entrypoint_command: Command to execute in the sandbox + gpu_values: GPU configuration string + cpu_count: Number of CPU cores + memory_mb: Memory allocation in MB + cloud: Cloud provider to use + region: Region to deploy in + timeout: Maximum execution timeout + environment_name: Modal environment name + """ + # Get or create persistent app + app = modal.App.lookup( + app_name, create_if_missing=True, environment_name=environment_name + ) + + logger.info(f"Creating sandbox for step {step_name}") + + with modal.enable_output(): + # Create sandbox for this step + sb = await modal.Sandbox.create.aio( + *entrypoint_command, + image=zenml_image, + gpu=gpu_values, + cpu=cpu_count, + memory=memory_mb, + cloud=cloud, + region=region, + app=app, + timeout=timeout, + ) + + # Wait for step completion + await sb.wait.aio() + logger.info(f"Sandbox for step {step_name} completed") + + +def finalize_run( + node_states: Dict[str, NodeStatus], args: argparse.Namespace +) -> None: + """Finalize the run by updating step and pipeline run statuses. + + Args: + node_states: The states of the nodes. + args: Parsed command line arguments. 
+ """ + try: + client = Client() + deployment = client.get_deployment(args.deployment_id) + + # Fetch the pipeline run + list_args: Dict[str, Any] = {} + if args.run_id: + list_args = dict(id=UUID(args.run_id)) + else: + list_args = dict(orchestrator_run_id=args.orchestrator_run_id) + + pipeline_runs = client.list_pipeline_runs( + hydrate=True, + project=deployment.project_id, + deployment_id=deployment.id, + **list_args, + ) + + if not len(pipeline_runs): + return + + pipeline_run = pipeline_runs[0] + pipeline_failed = False + + for step_name, node_state in node_states.items(): + if node_state != NodeStatus.FAILED: + continue + + pipeline_failed = True + + # Mark failed step runs as failed + step_run = pipeline_run.steps.get(step_name) + if step_run and step_run.status in { + ExecutionStatus.INITIALIZING, + ExecutionStatus.RUNNING, + }: + publish_utils.publish_failed_step_run(step_run.id) + + # Mark pipeline as failed if any steps failed + if pipeline_failed and pipeline_run.status in { + ExecutionStatus.INITIALIZING, + ExecutionStatus.RUNNING, + }: + publish_utils.publish_failed_pipeline_run(pipeline_run.id) + + except AuthorizationException: + # Token may be invalidated after completion, this is expected + pass + + def main() -> None: """Entrypoint of the Modal orchestrator sandbox.""" - # Log to the container's stdout so it can be streamed by Modal logger.info("Modal orchestrator sandbox started.") - # Parse arguments args = parse_args() - - # Set the orchestrator run ID in the environment os.environ[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = args.orchestrator_run_id - logger.info(f"Deployment ID: {args.deployment_id}") - logger.info(f"Orchestrator Run ID: {args.orchestrator_run_id}") - if args.run_id: - logger.info(f"Pipeline Run ID: {args.run_id}") + client = Client() + active_stack = client.active_stack + orchestrator = active_stack.orchestrator + assert isinstance(orchestrator, ModalOrchestrator) + + deployment = client.get_deployment(args.deployment_id) + 
pipeline_settings = cast( + ModalOrchestratorSettings, + orchestrator.get_settings(deployment), + ) + + # Setup Modal client + setup_modal_client( + token_id=orchestrator.config.token_id, + token_secret=orchestrator.config.token_secret, + workspace=orchestrator.config.workspace, + environment=orchestrator.config.modal_environment, + ) + + environment = get_config_environment_vars() + environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = args.orchestrator_run_id + + # Check execution mode + execution_mode = getattr( + pipeline_settings, "execution_mode", ModalExecutionMode.PIPELINE + ) try: - # Create the entrypoint arguments for pipeline execution - entrypoint_args = ( - PipelineEntrypointConfiguration.get_entrypoint_arguments( - deployment_id=args.deployment_id + if execution_mode == ModalExecutionMode.PIPELINE: + # Execute entire pipeline in this sandbox + logger.info("Executing entire pipeline in single sandbox") + entrypoint_args = ( + PipelineEntrypointConfiguration.get_entrypoint_arguments( + deployment_id=args.deployment_id + ) ) - ) + config = PipelineEntrypointConfiguration(arguments=entrypoint_args) + config.run() + + else: + # PER_STEP mode: Execute each step in separate sandbox + logger.info("Executing pipeline with per-step sandboxes") + + def run_step_wrapper(step_name: str) -> None: + run_step_on_modal( + step_name, deployment, pipeline_settings, environment + ) + + def finalize_wrapper(node_states: Dict[str, NodeStatus]) -> None: + finalize_run(node_states, args) - logger.info("Creating pipeline configuration") - config = PipelineEntrypointConfiguration(arguments=entrypoint_args) + # Build DAG from deployment + pipeline_dag = { + step_name: step.spec.upstream_steps + for step_name, step in deployment.step_configurations.items() + } - logger.info("Executing entire pipeline") - config.run() + # Run using ThreadedDagRunner + ThreadedDagRunner( + dag=pipeline_dag, + run_fn=run_step_wrapper, + finalize_fn=finalize_wrapper, + max_parallelism=getattr( + 
pipeline_settings, "max_parallelism", None + ), + ).run() logger.info("Pipeline execution completed successfully") From 34f717a0b5423818b3263799471c372e6dd1d4e7 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 24 Jun 2025 09:29:48 +0200 Subject: [PATCH 31/77] Add generate_sandbox_tags function and set tags in sandboxes --- .../modal/orchestrators/modal_orchestrator.py | 31 ++++++++- .../modal_orchestrator_entrypoint.py | 16 +++++ src/zenml/integrations/modal/utils.py | 69 ++++++++++++++++++- 3 files changed, 112 insertions(+), 4 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 2ae27f6f0d6..8e707515ee3 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -41,6 +41,7 @@ ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, build_modal_image, create_modal_stack_validator, + generate_sandbox_tags, get_gpu_values, get_resource_settings_from_deployment, get_resource_values, @@ -246,6 +247,11 @@ def prepare_or_run_pipeline( app_name=app_name, zenml_image=zenml_image, entrypoint_command=entrypoint_command, + deployment=deployment, + execution_mode=execution_mode.value, + run_id=str(placeholder_run.id) + if placeholder_run + else None, gpu_values=gpu_values, cpu_count=cpu_count, memory_mb=memory_mb, @@ -267,6 +273,11 @@ def prepare_or_run_pipeline( app_name=app_name, zenml_image=zenml_image, entrypoint_command=entrypoint_command, + deployment=deployment, + execution_mode=execution_mode.value, + run_id=str(placeholder_run.id) + if placeholder_run + else None, gpu_values=gpu_values, cpu_count=cpu_count, memory_mb=memory_mb, @@ -294,6 +305,9 @@ async def _execute_pipeline_sandbox( app_name: str, zenml_image: Any, entrypoint_command: List[str], + deployment: "PipelineDeploymentResponse", + execution_mode: str, + run_id: Optional[str] = None, gpu_values: Optional[str] = None, 
cpu_count: Optional[int] = None, memory_mb: Optional[int] = None, @@ -309,6 +323,9 @@ async def _execute_pipeline_sandbox( app_name: Name of the Modal app zenml_image: Pre-built ZenML Docker image for Modal entrypoint_command: Command to execute in the sandbox + deployment: Pipeline deployment for tagging + execution_mode: Execution mode for tagging + run_id: Pipeline run ID for tagging gpu_values: GPU configuration string cpu_count: Number of CPU cores memory_mb: Memory allocation in MB @@ -327,6 +344,15 @@ async def _execute_pipeline_sandbox( logger.info("Creating sandbox for pipeline execution") + # Generate tags for the sandbox + sandbox_tags = generate_sandbox_tags( + pipeline_name=deployment.pipeline_configuration.name, + deployment_id=str(deployment.id), + execution_mode=execution_mode, + run_id=run_id, + ) + logger.info(f"Sandbox tags: {sandbox_tags}") + with modal.enable_output(): # Create sandbox with the entrypoint command sb = await modal.Sandbox.create.aio( @@ -341,12 +367,15 @@ async def _execute_pipeline_sandbox( timeout=timeout, ) + # Set tags on the sandbox for organization + sb.set_tags(sandbox_tags) + logger.info("Sandbox created, executing pipeline...") if synchronous: # Stream output while waiting for completion logger.info("Streaming pipeline execution logs...") - async for line in sb.stdout.aio(): + async for line in sb.stdout: # Stream logs to stdout with proper formatting print(line, end="") diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index 45f7bc6eab5..c6566aa4a20 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -41,6 +41,7 @@ from zenml.integrations.modal.utils import ( ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, build_modal_image, + generate_sandbox_tags, get_gpu_values, get_resource_values, 
setup_modal_client, @@ -133,6 +134,7 @@ def run_step_on_modal( step_name=step_name, zenml_image=zenml_image, entrypoint_command=entrypoint_command, + deployment=deployment, gpu_values=gpu_values, cpu_count=cpu_count, memory_mb=memory_mb, @@ -153,6 +155,7 @@ async def _execute_step_sandbox( step_name: str, zenml_image: Any, entrypoint_command: list, + deployment: Any, gpu_values: str = None, cpu_count: int = None, memory_mb: int = None, @@ -168,6 +171,7 @@ async def _execute_step_sandbox( step_name: Name of the step zenml_image: Pre-built ZenML Docker image for Modal entrypoint_command: Command to execute in the sandbox + deployment: Pipeline deployment for tagging gpu_values: GPU configuration string cpu_count: Number of CPU cores memory_mb: Memory allocation in MB @@ -183,6 +187,15 @@ async def _execute_step_sandbox( logger.info(f"Creating sandbox for step {step_name}") + # Generate tags for the step sandbox + step_tags = generate_sandbox_tags( + pipeline_name=deployment.pipeline_configuration.name, + deployment_id=str(deployment.id), + execution_mode="PER_STEP", + step_name=step_name, + ) + logger.info(f"Step sandbox tags: {step_tags}") + with modal.enable_output(): # Create sandbox for this step sb = await modal.Sandbox.create.aio( @@ -197,6 +210,9 @@ async def _execute_step_sandbox( timeout=timeout, ) + # Set tags on the step sandbox + sb.set_tags(step_tags) + # Wait for step completion await sb.wait.aio() logger.info(f"Sandbox for step {step_name} completed") diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 7e14633144f..0e38d6b683c 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -13,6 +13,7 @@ # permissions and limitations under the License. 
"""Shared utilities for Modal integration components.""" +import hashlib import os from typing import Any, Dict, List, Optional, Tuple, Union @@ -192,17 +193,38 @@ def get_resource_values( return cpu_count, memory_mb +def _generate_image_cache_key( + image_name: str, environment: Dict[str, str] +) -> str: + """Generate a cache key for Modal image based on inputs. + + Args: + image_name: Base Docker image name + environment: Environment variables + + Returns: + Hash string to use as cache key + """ + # Create deterministic string from inputs + cache_input = f"{image_name}|{sorted(environment.items())}" + return hashlib.sha256(cache_input.encode()).hexdigest()[:12] + + +# Removed _get_cached_modal_image_name as we're using Modal's internal caching + + def build_modal_image( image_name: str, stack: "Stack", environment: Dict[str, str], ) -> Any: - """Build a Modal image from a ZenML-built Docker image. + """Build a Modal image from a ZenML-built Docker image with caching. Args: image_name: The name of the Docker image to use as base. stack: The ZenML stack containing container registry. environment: Environment variables to set in the image. + force_rebuild: Force rebuilding even if cached image exists. Returns: The configured Modal image. @@ -218,6 +240,12 @@ def build_modal_image( "it is correctly configured." 
) + # Generate cache key for this image configuration + cache_key = _generate_image_cache_key(image_name, environment) + + logger.info(f"Building Modal image (cache key: {cache_key})") + logger.info(f"Base image: {image_name}") + if docker_creds := stack.container_registry.credentials: docker_username, docker_password = docker_creds else: @@ -234,8 +262,8 @@ def build_modal_image( ) # Build Modal image from the ZenML-built image - # Use from_registry to pull the ZenML image with authentication - # and install Modal dependencies + # Modal will automatically cache layers and reuse when possible + logger.info(f"Creating Modal image from base: {image_name}") zenml_image = ( modal.Image.from_registry(image_name, secret=registry_secret) .pip_install("modal") # Install Modal in the container @@ -245,6 +273,41 @@ def build_modal_image( return zenml_image +def generate_sandbox_tags( + pipeline_name: str, + deployment_id: str, + execution_mode: str, + step_name: Optional[str] = None, + run_id: Optional[str] = None, +) -> Dict[str, str]: + """Generate tags for Modal sandboxes. + + Args: + pipeline_name: Name of the pipeline + deployment_id: ZenML deployment ID + execution_mode: Execution mode (PIPELINE or PER_STEP) + step_name: Step name (for PER_STEP mode) + run_id: Pipeline run ID + + Returns: + Dictionary of tags for the sandbox + """ + tags = { + "zenml_pipeline": pipeline_name, + "zenml_deployment_id": deployment_id, + "zenml_execution_mode": execution_mode, + "zenml_component": "modal_orchestrator", + } + + if step_name: + tags["zenml_step"] = step_name + + if run_id: + tags["zenml_run_id"] = run_id + + return tags + + def create_modal_stack_validator() -> StackValidator: """Create a stack validator for Modal components. 
From bc1f4881b3baf901f766ec74704d0ba8b14add28 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 24 Jun 2025 11:11:36 +0200 Subject: [PATCH 32/77] Refactor Modal orchestrator and image building functions --- .../modal/orchestrators/modal_orchestrator.py | 1 + .../modal_orchestrator_entrypoint.py | 24 ++++++++++++------- src/zenml/integrations/modal/utils.py | 9 ++++--- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 8e707515ee3..f5bc15cf1ca 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -355,6 +355,7 @@ async def _execute_pipeline_sandbox( with modal.enable_output(): # Create sandbox with the entrypoint command + # Note: Modal sandboxes inherit environment from the image sb = await modal.Sandbox.create.aio( *entrypoint_command, # Pass as separate arguments to avoid shell quoting issues image=zenml_image, diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index c6566aa4a20..c706110e3de 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -16,7 +16,7 @@ import argparse import asyncio import os -from typing import Any, Dict, cast +from typing import Any, Dict, List, Optional, cast from uuid import UUID import modal @@ -154,15 +154,15 @@ async def _execute_step_sandbox( app_name: str, step_name: str, zenml_image: Any, - entrypoint_command: list, + entrypoint_command: List[str], deployment: Any, - gpu_values: str = None, - cpu_count: int = None, - memory_mb: int = None, - cloud: str = None, - region: str = None, + gpu_values: Optional[str] = None, + cpu_count: Optional[int] = 
None, + memory_mb: Optional[int] = None, + cloud: Optional[str] = None, + region: Optional[str] = None, timeout: int = 86400, - environment_name: str = None, + environment_name: Optional[str] = None, ) -> None: """Execute a single step using Modal sandbox. @@ -278,7 +278,13 @@ def finalize_run( def main() -> None: - """Entrypoint of the Modal orchestrator sandbox.""" + """Entrypoint of the Modal orchestrator sandbox. + + This entrypoint is used to execute the pipeline in a Modal sandbox. + + Raises: + Exception: If the pipeline execution fails. + """ logger.info("Modal orchestrator sandbox started.") args = parse_args() diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 0e38d6b683c..aafc6d65ea5 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -210,21 +210,17 @@ def _generate_image_cache_key( return hashlib.sha256(cache_input.encode()).hexdigest()[:12] -# Removed _get_cached_modal_image_name as we're using Modal's internal caching - - def build_modal_image( image_name: str, stack: "Stack", environment: Dict[str, str], ) -> Any: - """Build a Modal image from a ZenML-built Docker image with caching. + """Build a Modal image from a ZenML-built Docker image. Args: image_name: The name of the Docker image to use as base. stack: The ZenML stack containing container registry. environment: Environment variables to set in the image. - force_rebuild: Force rebuilding even if cached image exists. Returns: The configured Modal image. 
@@ -261,6 +257,9 @@ def build_modal_image( } ) + # Build new Modal image and register it with consistent name + logger.info(f"🔨 Building Modal image from base: {image_name}") + # Build Modal image from the ZenML-built image # Modal will automatically cache layers and reuse when possible logger.info(f"Creating Modal image from base: {image_name}") From 32b5571109aeaada8918ac0e0e4f0f927f0c0821 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 24 Jun 2025 11:38:22 +0200 Subject: [PATCH 33/77] Added Modal step operator setup guide and examples --- .../component-guide/orchestrators/modal.md | 330 ++++++++++++++---- .../component-guide/step-operators/modal.md | 40 ++- 2 files changed, 306 insertions(+), 64 deletions(-) diff --git a/docs/book/component-guide/orchestrators/modal.md b/docs/book/component-guide/orchestrators/modal.md index 716ccc1b079..76923b55c7d 100644 --- a/docs/book/component-guide/orchestrators/modal.md +++ b/docs/book/component-guide/orchestrators/modal.md @@ -46,7 +46,24 @@ The Modal orchestrator runs on Modal's cloud infrastructure, so you don't need t ## How to use it -To use the Modal orchestrator, we need: +### Quick Start (5 minutes) + +```bash +# 1. Install Modal integration +zenml integration install modal + +# 2. Setup Modal authentication +modal setup + +# 3. Register orchestrator and run +zenml orchestrator register modal_orch --flavor=modal --synchronous=true +zenml stack update -o modal_orch +python my_pipeline.py +``` + +### Full Setup Requirements + +To use the Modal orchestrator, you need: * The ZenML `modal` integration installed. If you haven't done so, run: ```shell @@ -116,10 +133,18 @@ You can access the Modal dashboard at [modal.com/apps](https://modal.com/apps) t ZenML offers both a [Modal orchestrator](modal.md) and a [Modal step operator](../step-operators/modal.md). Choose based on your needs: -- **Modal Orchestrator**: Runs entire pipelines on Modal's infrastructure. 
Best for complete pipeline execution with consistent resource requirements. -- **Modal Step Operator**: Runs individual steps on Modal while keeping orchestration local. Best for selectively running compute-intensive steps (like training) on Modal while keeping other steps local. - -Use the orchestrator for full cloud execution, use the step operator for hybrid local/cloud workflows. +| Feature | Modal Step Operator | Modal Orchestrator | +|---------|-------------------|-------------------| +| **Execution Scope** | Individual steps only | Entire pipeline | +| **Orchestration** | Local ZenML | Remote Modal | +| **Resource Flexibility** | Per-step resources | Pipeline-wide resources | +| **Cost Model** | Pay per step execution | Pay per pipeline execution | +| **Setup Complexity** | Simple | Requires remote ZenML | +| **Best For** | Hybrid workflows, selective GPU usage | Full cloud execution, production | + +**Quick Decision Guide**: +- **Use Step Operator**: Need GPUs for only some steps, have local data dependencies, want hybrid local/cloud workflow +- **Use Orchestrator**: Want full cloud execution, production deployment, consistent resource requirements {% endhint %} The Modal orchestrator uses two types of settings following ZenML's standard pattern: @@ -134,16 +159,36 @@ The Modal orchestrator uses two types of settings following ZenML's standard pat - `region` - Cloud region preference - `cloud` - Cloud provider selection - `modal_environment` - Modal environment name (e.g., "main", "dev", "prod") - - `timeout`, `min_containers`, `max_containers` - Performance settings + - `execution_mode` - Execution strategy: "pipeline" (default) or "per_step" + - `max_parallelism` - Maximum concurrent steps (for "per_step" mode) + - `timeout` - Maximum execution time in seconds - `synchronous` - Wait for completion (True) or fire-and-forget (False) {% hint style="info" %} **GPU Configuration**: Use `ResourceSettings.gpu_count` to specify how many GPUs you need, and 
`ModalOrchestratorSettings.gpu` to specify what type of GPU. Modal will combine these automatically (e.g., `gpu_count=2` + `gpu="A100"` becomes `"A100:2"`). {% endhint %} -### Additional configuration +### Configuration Overview -Here's how to configure both types of settings: +**Simple Configuration (Recommended):** + +```python +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalOrchestratorSettings +) + +# Simple GPU pipeline +@pipeline( + settings={ + "orchestrator": ModalOrchestratorSettings(gpu="A100") + } +) +def my_gpu_pipeline(): + # Your pipeline steps here + ... +``` + +**Advanced Configuration:** ```python from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( @@ -153,21 +198,21 @@ from zenml.config import ResourceSettings # Configure Modal-specific settings modal_settings = ModalOrchestratorSettings( - gpu="A100", # GPU type (optional) - region="us-east-1", # Preferred region - cloud="aws", # Cloud provider - modal_environment="main", # Modal environment name - timeout=3600, # 1 hour timeout - min_containers=1, # Keep warm containers - max_containers=10, # Scale up to 10 containers - synchronous=True, # Wait for completion + gpu="A100", # GPU type (optional) + region="us-east-1", # Preferred region + cloud="aws", # Cloud provider + modal_environment="production", # Modal environment name + execution_mode="pipeline", # "pipeline" (default) or "per_step" + max_parallelism=3, # Max concurrent steps (per_step mode) + timeout=3600, # 1 hour timeout + synchronous=True, # Wait for completion ) # Configure hardware resources (quantities) resource_settings = ResourceSettings( - cpu_count=16, # Number of CPU cores - memory="32GB", # 32GB RAM - gpu_count=1 # Number of GPUs (combined with gpu type below) + cpu_count=16, # Number of CPU cores + memory="32GB", # 32GB RAM + gpu_count=1 # Number of GPUs ) @pipeline( @@ -184,9 +229,13 @@ def my_modal_pipeline(): ### Resource configuration {% hint style="info" %} 
-**Pipeline-Level Resources**: The Modal orchestrator uses pipeline-level resource settings to configure the Modal function for the entire pipeline. All steps share the same Modal function resources. Configure resources at the `@pipeline` level for best results. +**Resource Configuration by Execution Mode**: + +**Pipeline Mode**: All steps share the same resources configured at the pipeline level. Configure resources at the `@pipeline` level for best results. -**Resource Fallback Behavior**: If no pipeline-level resource settings are provided, the orchestrator will automatically use the highest resource requirements found across all steps in the pipeline. This ensures adequate resources for all steps while maintaining the single-function execution model. +**Per-Step Mode**: Each step can have its own resource configuration! You can mix different GPUs, CPU, and memory settings across steps. Pipeline-level settings serve as defaults that individual steps can override. + +**Resource Fallback Behavior**: If no pipeline-level resource settings are provided, the orchestrator will automatically use the highest resource requirements found across all steps in the pipeline. {% endhint %} You can configure pipeline-wide resource requirements using `ResourceSettings` for hardware resources and `ModalOrchestratorSettings` for Modal-specific configurations: @@ -227,18 +276,114 @@ def second_step(): ... 
``` +### Execution Modes + +The Modal orchestrator supports two execution modes: + +#### Pipeline Mode (Default - Recommended) + +```python +modal_settings = ModalOrchestratorSettings( + execution_mode="pipeline", # Execute entire pipeline in one sandbox + gpu="A100" +) +``` + +**Benefits:** +- **Fastest execution**: Entire pipeline runs in single sandbox +- **Cost-effective**: Minimal overhead and resource usage +- **Simple**: All steps share same environment and resources + +**Best for:** Most ML pipelines, production workloads, cost optimization + +#### Per-Step Mode (Advanced) + +```python +modal_settings = ModalOrchestratorSettings( + execution_mode="per_step", # Execute each step in separate sandbox + max_parallelism=3, # Run up to 3 steps concurrently + gpu="T4" # Default GPU for steps (can be overridden per step) +) +``` + +**Benefits:** +- **Granular control**: Each step runs in isolated sandbox with its own resources +- **Parallel execution**: Steps can run concurrently based on dependencies +- **Step-specific resources**: Each step can have different CPU, memory, GPU configurations +- **Resource optimization**: Use expensive GPUs only for steps that need them + +**Best for:** Complex pipelines with varying resource needs, debugging individual steps, cost optimization + +**Per-Step Resource Configuration:** +In per-step mode, you can configure different resources for each step, enabling powerful resource optimization: + +```python +@pipeline( + settings={ + "orchestrator": ModalOrchestratorSettings( + execution_mode="per_step", + max_parallelism=2, + gpu="T4" # Default GPU for steps + ) + } +) +def mixed_resource_pipeline(): + # Light preprocessing - no GPU needed + preprocess_data() + + # Heavy training - needs A100 GPU + train_model() + + # Evaluation - T4 GPU sufficient + evaluate_model() + +@step( + settings={ + "resources": ResourceSettings(cpu_count=2, memory="4GB") # CPU-only step + } +) +def preprocess_data(): + # Light CPU work - no GPU, saves 
costs + pass + +@step( + settings={ + "orchestrator": ModalOrchestratorSettings(gpu="A100"), # Override to A100 + "resources": ResourceSettings(gpu_count=1, memory="32GB") + } +) +def train_model(): + # Heavy training with A100 GPU and 32GB RAM + pass + +@step( + settings={ + "resources": ResourceSettings(gpu_count=1, memory="16GB") # Uses pipeline default T4 + } +) +def evaluate_model(): + # Evaluation with T4 GPU and 16GB RAM + pass +``` + +**Key Benefits of Per-Step Resource Configuration:** +- **Cost optimization**: Use expensive GPUs (A100, H100) only for steps that need them +- **Resource efficiency**: Match CPU/memory to actual step requirements +- **Parallel execution**: Steps with different resources can run concurrently +- **Flexibility**: Each step gets exactly the resources it needs + ### Sandbox Architecture The Modal orchestrator uses a simplified sandbox-based architecture: - **Persistent apps per pipeline**: Each pipeline gets its own Modal app that stays alive -- **Dynamic sandboxes for execution**: Each pipeline run creates a fresh sandbox for complete isolation +- **Dynamic sandboxes for execution**: Each pipeline run creates fresh sandboxes for complete isolation - **Built-in output streaming**: Modal automatically handles log streaming and output capture - **Maximum flexibility**: Sandboxes can execute arbitrary commands and provide better isolation This architecture provides optimal benefits: - **Simplicity**: No complex app deployment or time window management -- **Flexibility**: Sandboxes offer more dynamic execution capabilities than functions +- **Flexibility**: Sandboxes offer more dynamic execution capabilities - **Isolation**: Each run gets a completely fresh execution environment - **Performance**: Persistent apps eliminate deployment overhead @@ -307,32 +452,39 @@ Available GPU types include: - `A100` - High-performance for large model training - `H100` - Latest generation for maximum performance -**Examples of GPU configurations 
(applied to entire pipeline):** +**Examples of GPU configurations:** ```python -# Pipeline with GPU - configure on first step or pipeline level +# Simple GPU pipeline (recommended) @pipeline( settings={ - "resources": ResourceSettings(gpu_count=1), - "orchestrator": ModalOrchestratorSettings(gpu="A100") + "orchestrator": ModalOrchestratorSettings( + gpu="A100", + execution_mode="pipeline" # Default: entire pipeline in one sandbox + ), + "resources": ResourceSettings(gpu_count=1) } ) -def gpu_pipeline(): - # All steps in this pipeline will have access to 1x A100 GPU +def simple_gpu_pipeline(): + # All steps run in same sandbox with 1x A100 GPU step_one() step_two() -# Multiple GPUs - configure at pipeline level +# Per-step execution with multiple GPUs @pipeline( settings={ - "resources": ResourceSettings(gpu_count=4), - "orchestrator": ModalOrchestratorSettings(gpu="A100") + "orchestrator": ModalOrchestratorSettings( + gpu="A100", + execution_mode="per_step", # Each step in separate sandbox + max_parallelism=2 # Run up to 2 steps concurrently + ), + "resources": ResourceSettings(gpu_count=4) } ) def multi_gpu_pipeline(): - # All steps in this pipeline will have access to 4x A100 GPUs - training_step() - evaluation_step() + # Each step runs in separate sandbox with 4x A100 GPUs + training_step() # Sandbox 1: 4x A100 + evaluation_step() # Sandbox 2: 4x A100 (can run in parallel) ``` ### Synchronous vs Asynchronous execution @@ -428,66 +580,126 @@ def my_pipeline(): This ensures your pipelines start executing quickly by reusing persistent apps and creating fresh sandboxes for isolation. 
-{% hint style="warning" %} -**Cost Implications and Optimization** +### Cost Optimization -Understanding Modal orchestrator costs with sandbox architecture: +{% hint style="warning" %} +**Understanding Modal Costs** -**Execution Costs**: +**Execution Model**: - **Pay-per-use**: You only pay when sandboxes are actively running - **No idle costs**: Persistent apps don't incur costs when not executing -- **Sandbox overhead**: Minimal - sandboxes start quickly on persistent apps +- **Resource-based pricing**: Cost depends on CPU, memory, and GPU usage -**Resource Optimization**: -- **GPU usage**: Only allocated during actual pipeline execution -- **Memory and CPU**: Charged only for sandbox execution time -- **Storage**: Docker images are cached across runs on persistent apps +**Execution Mode Impact**: +- **Pipeline mode**: Most cost-effective - single sandbox for entire pipeline +- **Per-step mode**: Higher overhead - separate sandbox per step, but enables parallelism + +**Cost Examples (approximate)**: +```python +# Cost-effective: Pipeline mode +# Single A100 GPU for 30-minute pipeline = ~$0.80 +ModalOrchestratorSettings(execution_mode="pipeline", gpu="A100") + +# Higher cost: Per-step mode +# A100 GPU per step (5 steps × 6 min each) = ~$0.80 +# But steps can run in parallel, reducing total time +ModalOrchestratorSettings(execution_mode="per_step", gpu="A100") +``` **Cost Optimization Strategies**: -- **Efficient pipelines**: Optimize pipeline execution time to reduce costs -- **Right-size resources**: Use appropriate CPU/memory/GPU for your workload -- **Regional selection**: Choose regions close to your data sources -- **Timeout management**: Set appropriate timeouts to avoid runaway costs +- **Use pipeline mode** for most workloads (fastest, cheapest) +- **Right-size resources**: Don't use A100s for simple preprocessing +- **Optimize pipeline execution time** to reduce sandbox runtime +- **Choose efficient regions** close to your data sources +- **Set 
appropriate timeouts** to avoid runaway costs -Monitor your Modal dashboard to track sandbox execution time and resource usage for cost optimization. +Monitor your Modal dashboard to track sandbox execution time and resource usage. {% endhint %} ## Best practices -1. **Use pipeline mode for production**: The default `pipeline` execution mode runs your entire pipeline in one function, minimizing overhead and cost. +1. **Start with pipeline mode**: The default `pipeline` execution mode runs your entire pipeline in one sandbox, minimizing overhead and cost. Switch to `per_step` only if you need granular control. 2. **Separate resource and orchestrator settings**: Use `ResourceSettings` for hardware (CPU, memory, GPU count) and `ModalOrchestratorSettings` for Modal-specific configurations (GPU type, region, etc.). 3. **Configure appropriate timeouts**: Set realistic timeouts for your workloads: ```python modal_settings = ModalOrchestratorSettings( - timeout=7200 # 2 hours + timeout=7200, # 2 hours + execution_mode="pipeline" # Recommended for most cases ) ``` -4. **Choose the right region**: Select regions close to your data sources to minimize transfer costs and latency. +4. **Choose execution mode based on needs**: + - **Pipeline mode**: For production, cost optimization, simple workflows + - **Per-step mode**: For debugging, heterogeneous resources, or parallel execution + +5. **Use appropriate GPU types**: Match GPU types to your workload requirements: + - `T4`: Inference, light training, cost-sensitive workloads + - `A100`: Large model training, high-performance computing + - `H100`: Latest generation, maximum performance + +6. **Optimize for your execution mode**: + - **Pipeline mode**: Optimize total pipeline runtime + - **Per-step mode**: Set appropriate `max_parallelism` (typically 2-4) -5. **Use appropriate GPU types**: Match GPU types to your workload requirements - don't use A100s for simple inference tasks. +7. 
**Monitor resource usage**: Use Modal's dashboard to track your resource consumption and optimize accordingly. -6. **Monitor resource usage**: Use Modal's dashboard to track your resource consumption and optimize accordingly. +8. **Environment separation**: Use separate Modal environments (`dev`, `staging`, `prod`) for different deployment stages. ## Troubleshooting ### Common issues -1. **Authentication errors**: Ensure your Modal token is correctly configured and has the necessary permissions. +1. **Authentication errors**: + ```bash + # Verify Modal setup + modal auth show + + # Re-authenticate if needed + modal setup + ``` + +2. **Image build failures**: + - Check Docker registry credentials in your ZenML stack + - Verify your Docker daemon is running + - Ensure base image compatibility with Modal's environment + +3. **Resource allocation errors**: + ``` + Error: No capacity for requested GPU type + ``` + **Solution**: Try different regions or GPU types, or reduce `max_parallelism` in per-step mode + +4. **Pipeline timeouts**: + ```python + # Increase timeout for long-running pipelines + ModalOrchestratorSettings(timeout=14400) # 4 hours + ``` + +5. **Per-step mode issues**: + - **Too many concurrent steps**: Reduce `max_parallelism` + - **Resource conflicts**: Ensure adequate quota for parallel execution + - **Step dependencies**: Verify your pipeline DAG allows for parallelism -2. **Image build failures**: Check that your Docker registry credentials are properly configured in your ZenML stack. +### Performance troubleshooting -3. **Resource limits**: If you hit resource limits, consider breaking large steps into smaller ones or requesting quota increases from Modal. +**Slow execution in per-step mode**: +- Reduce `max_parallelism` to avoid resource contention +- Consider switching to `pipeline` mode for better performance +- Check Modal dashboard for sandbox startup times -4. 
**Network timeouts**: For long-running steps, ensure your timeout settings are appropriate. +**Memory issues**: +- Increase memory allocation in `ResourceSettings` +- For pipeline mode: ensure total memory covers all steps +- For per-step mode: configure per-step memory requirements ### Getting help - Check the [Modal documentation](https://modal.com/docs) for platform-specific issues -- Monitor your functions in the [Modal dashboard](https://modal.com/apps) +- Monitor your sandboxes in the [Modal dashboard](https://modal.com/apps) - Use `zenml logs` to view detailed pipeline execution logs +- Check ZenML step operator docs for [hybrid workflows](../step-operators/modal.md) For more information and a full list of configurable attributes of the Modal orchestrator, check out the [SDK Docs](https://sdkdocs.zenml.io/latest/integration_code_docs/integrations-modal.html#zenml.integrations.modal.orchestrators). \ No newline at end of file diff --git a/docs/book/component-guide/step-operators/modal.md b/docs/book/component-guide/step-operators/modal.md index 1d0bd36fa0b..2989aa5b146 100644 --- a/docs/book/component-guide/step-operators/modal.md +++ b/docs/book/component-guide/step-operators/modal.md @@ -13,10 +13,18 @@ description: Executing individual steps in Modal. ZenML offers both a [Modal step operator](modal.md) and a [Modal orchestrator](../orchestrators/modal.md). Choose based on your needs: -- **Modal Step Operator**: Runs individual steps on Modal while keeping orchestration local. Best for selectively running compute-intensive steps (like training) on Modal while keeping other steps local. -- **Modal Orchestrator**: Runs entire pipelines on Modal's infrastructure. Best for complete pipeline execution with consistent resource requirements. - -Use the step operator for hybrid local/cloud workflows, use the orchestrator for full cloud execution. 
+| Feature | Modal Step Operator | Modal Orchestrator | +|---------|-------------------|-------------------| +| **Execution Scope** | Individual steps only | Entire pipeline | +| **Orchestration** | Local ZenML | Remote Modal | +| **Resource Flexibility** | Per-step resources | Pipeline-wide resources | +| **Cost Model** | Pay per step execution | Pay per pipeline execution | +| **Setup Complexity** | Simple | Requires remote ZenML | +| **Best For** | Hybrid workflows, selective GPU usage | Full cloud execution, production | + +**Quick Decision Guide**: +- **Use Step Operator**: Need GPUs for only some steps, have local data dependencies, want hybrid local/cloud workflow +- **Use Orchestrator**: Want full cloud execution, production deployment, consistent resource requirements {% endhint %} You should use the Modal step operator if: @@ -47,7 +55,29 @@ To use the Modal step operator: ### How to use it -To use the Modal step operator, we need: +#### Quick Start (5 minutes) + +```bash +# 1. Install Modal integration +zenml integration install modal + +# 2. Setup Modal authentication +modal setup + +# 3. Register step operator +zenml step-operator register modal_step --flavor=modal +zenml stack update -s modal_step + +# 4. Use in your code +@step(step_operator="modal_step") +def train_model(): + # This step runs on Modal + pass +``` + +#### Full Setup Requirements + +To use the Modal step operator, you need: * The ZenML `modal` integration installed. 
If you haven't done so, run From 31b2c135e30871f200db13d62e27b6e81a7f5abe Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 24 Jun 2025 12:00:39 +0200 Subject: [PATCH 34/77] Refactor configuration options inheritance logic --- .../integrations/modal/flavors/modal_orchestrator_flavor.py | 4 ++-- .../integrations/modal/flavors/modal_step_operator_flavor.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 7b40b942369..06cc739e2a4 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -79,16 +79,16 @@ class ModalOrchestratorConfig( token_id: Modal API token ID (ak-xxxxx format) for authentication. token_secret: Modal API token secret (as-xxxxx format) for authentication. workspace: Modal workspace name (optional). - modal_environment: Modal environment name (optional). Note: If token_id and token_secret are not provided, falls back to Modal's default authentication (~/.modal.toml). + All other configuration options (modal_environment, gpu, region, etc.) + are inherited from ModalOrchestratorSettings. """ token_id: Optional[str] = SecretField(default=None) token_secret: Optional[str] = SecretField(default=None) workspace: Optional[str] = None - modal_environment: Optional[str] = None @property def is_remote(self) -> bool: diff --git a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py index 53348756070..17615616e5e 100644 --- a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py @@ -61,16 +61,16 @@ class ModalStepOperatorConfig( token_id: Modal API token ID (ak-xxxxx format) for authentication. 
token_secret: Modal API token secret (as-xxxxx format) for authentication. workspace: Modal workspace name (optional). - modal_environment: Modal environment name (optional). Note: If token_id and token_secret are not provided, falls back to Modal's default authentication (~/.modal.toml). + All other configuration options (modal_environment, gpu, region, etc.) + are inherited from ModalStepOperatorSettings. """ token_id: Optional[str] = SecretField(default=None) token_secret: Optional[str] = SecretField(default=None) workspace: Optional[str] = None - modal_environment: Optional[str] = None @property def is_remote(self) -> bool: From 4a90c3d508c41bce978402254e792aaf1ad0d676 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 24 Jun 2025 12:07:49 +0200 Subject: [PATCH 35/77] Update modal environment flag format in step operators --- docs/book/component-guide/step-operators/modal.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/book/component-guide/step-operators/modal.md b/docs/book/component-guide/step-operators/modal.md index 2989aa5b146..aab64e0ec93 100644 --- a/docs/book/component-guide/step-operators/modal.md +++ b/docs/book/component-guide/step-operators/modal.md @@ -110,7 +110,7 @@ zenml step-operator register \ --token-id= \ --token-secret= \ --workspace= \ - --modal-environment= + --modal_environment= zenml stack update -s ... 
``` @@ -235,7 +235,7 @@ zenml step-operator register modal_prod \ --token-id= \ --token-secret= \ --workspace="production-workspace" \ - --modal-environment="production" + --modal_environment="production" ``` ### Resource configuration notes From 312d490847488cdb47bb6bb8528b886dcb6dcff5 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 24 Jun 2025 12:15:41 +0200 Subject: [PATCH 36/77] Update modal orchestrator and step operator flavors --- .../integrations/modal/flavors/modal_orchestrator_flavor.py | 2 +- .../integrations/modal/flavors/modal_step_operator_flavor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 06cc739e2a4..ae0c2b1aa7c 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -82,7 +82,7 @@ class ModalOrchestratorConfig( Note: If token_id and token_secret are not provided, falls back to Modal's default authentication (~/.modal.toml). - All other configuration options (modal_environment, gpu, region, etc.) + All other configuration options (modal_environment, gpu, region, etc.) are inherited from ModalOrchestratorSettings. """ diff --git a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py index 17615616e5e..942ee91f166 100644 --- a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py @@ -64,7 +64,7 @@ class ModalStepOperatorConfig( Note: If token_id and token_secret are not provided, falls back to Modal's default authentication (~/.modal.toml). - All other configuration options (modal_environment, gpu, region, etc.) + All other configuration options (modal_environment, gpu, region, etc.) 
are inherited from ModalStepOperatorSettings. """ From 11d63b377770eaaf8ebc02df308ae91e3835ac94 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 24 Jun 2025 22:26:36 +0200 Subject: [PATCH 37/77] Update entrypoint configuration description --- .../modal_orchestrator_entrypoint_configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py index ae4398bb789..db0f478f631 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py @@ -24,7 +24,7 @@ class ModalOrchestratorEntrypointConfiguration: - """Entrypoint configuration for the Modal orchestrator sandbox.""" + """Entrypoint configuration for the orchestrator sandbox.""" @classmethod def get_entrypoint_options(cls) -> Set[str]: From 75e0615869d5e2b59987390b5911bfb3dcd1d6b5 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Wed, 25 Jun 2025 21:28:37 +0200 Subject: [PATCH 38/77] Update configuration section heading to "Configuration Examples". --- docs/book/component-guide/orchestrators/modal.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/book/component-guide/orchestrators/modal.md b/docs/book/component-guide/orchestrators/modal.md index 76923b55c7d..171a8d75cc4 100644 --- a/docs/book/component-guide/orchestrators/modal.md +++ b/docs/book/component-guide/orchestrators/modal.md @@ -168,7 +168,7 @@ The Modal orchestrator uses two types of settings following ZenML's standard pat **GPU Configuration**: Use `ResourceSettings.gpu_count` to specify how many GPUs you need, and `ModalOrchestratorSettings.gpu` to specify what type of GPU. Modal will combine these automatically (e.g., `gpu_count=2` + `gpu="A100"` becomes `"A100:2"`). 
{% endhint %} -### Configuration Overview +### Configuration Examples **Simple Configuration (Recommended):** From 4fa278fc1390db365db4f2bacbe1b97bd2f56109 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Wed, 25 Jun 2025 21:30:27 +0200 Subject: [PATCH 39/77] Create deep copy of settings for Modal orchestrator --- .../modal_orchestrator_entrypoint.py | 18 +++++++++------ src/zenml/integrations/modal/utils.py | 23 +------------------ 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index c706110e3de..81af86ba8ca 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -16,6 +16,7 @@ import argparse import asyncio import os +from copy import deepcopy from typing import Any, Dict, List, Optional, cast from uuid import UUID @@ -86,6 +87,9 @@ def run_step_on_modal( """ logger.info(f"Running step '{step_name}' in Modal sandbox") + # Create a deep copy of pipeline-level settings to avoid modifying the original + step_settings_copy = deepcopy(settings) + # Get step-specific settings if any step_config = deployment.step_configurations[step_name].config step_settings = step_config.settings.get("orchestrator.modal", None) @@ -93,12 +97,12 @@ def run_step_on_modal( step_modal_settings = ModalOrchestratorSettings.model_validate( step_settings.model_dump() if step_settings else {} ) - # Merge with pipeline-level settings + # Merge with pipeline-level settings copy for key, value in step_modal_settings.model_dump( exclude_unset=True ).items(): if value is not None: - setattr(settings, key, value) + setattr(step_settings_copy, key, value) # Get resource settings for this step resource_settings = step_config.settings.get(RESOURCE_SETTINGS_KEY) @@ -112,7 +116,7 @@ def run_step_on_modal( zenml_image = 
build_modal_image(image_name, active_stack, environment) # Configure resources - gpu_values = get_gpu_values(settings.gpu, resource_settings) + gpu_values = get_gpu_values(step_settings_copy.gpu, resource_settings) cpu_count, memory_mb = get_resource_values(resource_settings) # Create step entrypoint command @@ -138,10 +142,10 @@ def run_step_on_modal( gpu_values=gpu_values, cpu_count=cpu_count, memory_mb=memory_mb, - cloud=settings.cloud, - region=settings.region, - timeout=settings.timeout, - environment_name=settings.modal_environment, + cloud=step_settings_copy.cloud, + region=step_settings_copy.region, + timeout=step_settings_copy.timeout, + environment_name=step_settings_copy.modal_environment, ) ) logger.info(f"Step {step_name} completed successfully") diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index aafc6d65ea5..c7ba9352e20 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -13,7 +13,6 @@ # permissions and limitations under the License. """Shared utilities for Modal integration components.""" -import hashlib import os from typing import Any, Dict, List, Optional, Tuple, Union @@ -193,23 +192,6 @@ def get_resource_values( return cpu_count, memory_mb -def _generate_image_cache_key( - image_name: str, environment: Dict[str, str] -) -> str: - """Generate a cache key for Modal image based on inputs. - - Args: - image_name: Base Docker image name - environment: Environment variables - - Returns: - Hash string to use as cache key - """ - # Create deterministic string from inputs - cache_input = f"{image_name}|{sorted(environment.items())}" - return hashlib.sha256(cache_input.encode()).hexdigest()[:12] - - def build_modal_image( image_name: str, stack: "Stack", @@ -236,10 +218,7 @@ def build_modal_image( "it is correctly configured." 
) - # Generate cache key for this image configuration - cache_key = _generate_image_cache_key(image_name, environment) - - logger.info(f"Building Modal image (cache key: {cache_key})") + logger.info("Building Modal image") logger.info(f"Base image: {image_name}") if docker_creds := stack.container_registry.credentials: From 79a449ac6c7229c2663321a0bcfc7cb142c193a8 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 26 Jun 2025 10:25:38 +0200 Subject: [PATCH 40/77] Add ModalOrchestrator for running pipelines on Modal platform --- docs/book/component-guide/orchestrators/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/book/component-guide/orchestrators/README.md b/docs/book/component-guide/orchestrators/README.md index de13007055c..e68be150d24 100644 --- a/docs/book/component-guide/orchestrators/README.md +++ b/docs/book/component-guide/orchestrators/README.md @@ -34,6 +34,7 @@ Out of the box, ZenML comes with a `local` orchestrator already part of the defa | [SkypilotGCPOrchestrator](skypilot-vm.md) | `vm_gcp` | `skypilot[gcp]` | Runs your pipelines in GCP VMs using SkyPilot | | [SkypilotAzureOrchestrator](skypilot-vm.md) | `vm_azure` | `skypilot[azure]` | Runs your pipelines in Azure VMs using SkyPilot | | [HyperAIOrchestrator](hyperai.md) | `hyperai` | `hyperai` | Runs your pipeline in HyperAI.ai instances. | +| [ModalOrchestrator](modal.md) | `modal` | `modal` | Runs your pipelines on Modal's serverless cloud platform. 
| | [Custom Implementation](custom.md) | _custom_ | | Extend the orchestrator abstraction and provide your own implementation | If you would like to see the available flavors of orchestrators, you can use the command: From 5be346dd40fac411047d2f71c10bc1869186bf93 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 11 Jul 2025 21:53:47 +0200 Subject: [PATCH 41/77] Refactor building Modal image with deployment caching --- .../modal/orchestrators/modal_orchestrator.py | 145 +++++++----------- .../modal_orchestrator_entrypoint.py | 1 - src/zenml/integrations/modal/utils.py | 87 ++++++++++- 3 files changed, 137 insertions(+), 96 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index f5bc15cf1ca..83e67fc9b41 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -39,10 +39,10 @@ ) from zenml.integrations.modal.utils import ( ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, - build_modal_image, create_modal_stack_validator, generate_sandbox_tags, get_gpu_values, + get_or_build_modal_image, get_resource_settings_from_deployment, get_resource_values, setup_modal_client, @@ -128,27 +128,6 @@ def get_orchestrator_run_id(self) -> str: f"{ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID}." ) - def _build_modal_image( - self, - deployment: "PipelineDeploymentResponse", - stack: "Stack", - environment: Dict[str, str], - ) -> Any: - """Build the Modal image for pipeline execution. - - Args: - deployment: The pipeline deployment. - stack: The stack the pipeline will run on. - environment: Environment variables to set. - - Returns: - The configured Modal image. 
- """ - # Get the ZenML-built image that contains all pipeline code - image_name = self.get_image(deployment=deployment) - - return build_modal_image(image_name, stack, environment) - def prepare_or_run_pipeline( self, deployment: "PipelineDeploymentResponse", @@ -209,9 +188,6 @@ def prepare_or_run_pipeline( deployment, RESOURCE_SETTINGS_KEY ) - # Build Modal image - zenml_image = self._build_modal_image(deployment, stack, environment) - # Configure resources from resource settings gpu_values = get_gpu_values(settings.gpu, resource_settings) cpu_count, memory_mb = get_resource_values(resource_settings) @@ -225,6 +201,23 @@ def prepare_or_run_pipeline( ) app_name = f"zenml-pipeline-{pipeline_name}" + # Create Modal app for caching and execution + app = modal.App.lookup( + app_name, + create_if_missing=True, + environment_name=settings.modal_environment + or self.config.modal_environment, + ) + + # Get or build Modal image with caching based on deployment ID + image_name = self.get_image(deployment=deployment) + zenml_image = get_or_build_modal_image( + image_name=image_name, + stack=stack, + deployment_id=str(deployment.id), + app=app, + ) + # Build entrypoint command and args for the orchestrator sandbox command = ( ModalOrchestratorEntrypointConfiguration.get_entrypoint_command() @@ -238,59 +231,29 @@ def prepare_or_run_pipeline( ) entrypoint_command = command + args - # Execute using sandbox based on execution mode + # Execute using sandbox try: - if execution_mode == ModalExecutionMode.PIPELINE: - # Execute entire pipeline in one sandbox (fastest) - asyncio.run( - self._execute_pipeline_sandbox( - app_name=app_name, - zenml_image=zenml_image, - entrypoint_command=entrypoint_command, - deployment=deployment, - execution_mode=execution_mode.value, - run_id=str(placeholder_run.id) - if placeholder_run - else None, - gpu_values=gpu_values, - cpu_count=cpu_count, - memory_mb=memory_mb, - cloud=settings.cloud or self.config.cloud, - region=settings.region or 
self.config.region, - timeout=settings.timeout or self.config.timeout, - environment_name=settings.modal_environment - or self.config.modal_environment, - synchronous=settings.synchronous - if hasattr(settings, "synchronous") - else self.config.synchronous, - ) - ) - else: - # PER_STEP mode: Execute each step in separate sandbox - # This is handled by the entrypoint using ThreadedDagRunner - asyncio.run( - self._execute_pipeline_sandbox( - app_name=app_name, - zenml_image=zenml_image, - entrypoint_command=entrypoint_command, - deployment=deployment, - execution_mode=execution_mode.value, - run_id=str(placeholder_run.id) - if placeholder_run - else None, - gpu_values=gpu_values, - cpu_count=cpu_count, - memory_mb=memory_mb, - cloud=settings.cloud or self.config.cloud, - region=settings.region or self.config.region, - timeout=settings.timeout or self.config.timeout, - environment_name=settings.modal_environment - or self.config.modal_environment, - synchronous=settings.synchronous - if hasattr(settings, "synchronous") - else self.config.synchronous, - ) + asyncio.run( + self._execute_pipeline_sandbox( + app=app, + zenml_image=zenml_image, + entrypoint_command=entrypoint_command, + deployment=deployment, + run_id=str(placeholder_run.id) + if placeholder_run + else None, + gpu_values=gpu_values, + cpu_count=cpu_count, + memory_mb=memory_mb, + cloud=settings.cloud or self.config.cloud, + region=settings.region or self.config.region, + timeout=settings.timeout or self.config.timeout, + environment=environment, + synchronous=settings.synchronous + if hasattr(settings, "synchronous") + else self.config.synchronous, ) + ) except Exception as e: logger.error(f"Pipeline execution failed: {e}") logger.info("Check Modal dashboard for detailed logs") @@ -302,11 +265,10 @@ def prepare_or_run_pipeline( async def _execute_pipeline_sandbox( self, - app_name: str, + app: Any, zenml_image: Any, entrypoint_command: List[str], deployment: "PipelineDeploymentResponse", - execution_mode: 
str, run_id: Optional[str] = None, gpu_values: Optional[str] = None, cpu_count: Optional[int] = None, @@ -314,17 +276,16 @@ async def _execute_pipeline_sandbox( cloud: Optional[str] = None, region: Optional[str] = None, timeout: int = 86400, - environment_name: Optional[str] = None, + environment: Optional[Dict[str, str]] = None, synchronous: bool = True, ) -> None: """Execute pipeline using Modal sandbox. Args: - app_name: Name of the Modal app + app: Modal app instance zenml_image: Pre-built ZenML Docker image for Modal entrypoint_command: Command to execute in the sandbox deployment: Pipeline deployment for tagging - execution_mode: Execution mode for tagging run_id: Pipeline run ID for tagging gpu_values: GPU configuration string cpu_count: Number of CPU cores @@ -332,15 +293,10 @@ async def _execute_pipeline_sandbox( cloud: Cloud provider to use region: Region to deploy in timeout: Maximum execution timeout - environment_name: Modal environment name + environment: Environment variables for the sandbox synchronous: Whether to wait for completion """ - # Create persistent app (will reuse if exists) - app = modal.App.lookup( - app_name, create_if_missing=True, environment_name=environment_name - ) - - logger.info(f"Using Modal app: {app_name}") + logger.info(f"Using Modal app: {app.name}") logger.info("Creating sandbox for pipeline execution") @@ -348,14 +304,22 @@ async def _execute_pipeline_sandbox( sandbox_tags = generate_sandbox_tags( pipeline_name=deployment.pipeline_configuration.name, deployment_id=str(deployment.id), - execution_mode=execution_mode, + execution_mode="PIPELINE", run_id=run_id, ) logger.info(f"Sandbox tags: {sandbox_tags}") with modal.enable_output(): # Create sandbox with the entrypoint command - # Note: Modal sandboxes inherit environment from the image + # Use a single persistent secret per app for environment variables + secrets = [] + if environment: + secret_name = f"zenml-env-{app.name.replace('-', '_')}" + env_secret = 
modal.Secret.from_dict( + environment, name=secret_name + ) + secrets.append(env_secret) + sb = await modal.Sandbox.create.aio( *entrypoint_command, # Pass as separate arguments to avoid shell quoting issues image=zenml_image, @@ -366,6 +330,7 @@ async def _execute_pipeline_sandbox( region=region, app=app, timeout=timeout, + secrets=secrets, ) # Set tags on the sandbox for organization diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index 81af86ba8ca..3bd99ad0d46 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -195,7 +195,6 @@ async def _execute_step_sandbox( step_tags = generate_sandbox_tags( pipeline_name=deployment.pipeline_configuration.name, deployment_id=str(deployment.id), - execution_mode="PER_STEP", step_name=step_name, ) logger.info(f"Step sandbox tags: {step_tags}") diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index c7ba9352e20..97bfade47db 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -192,17 +192,94 @@ def get_resource_values( return cpu_count, memory_mb +def get_or_build_modal_image( + image_name: str, + stack: "Stack", + deployment_id: str, + app: Any, +) -> Any: + """Get existing Modal image or build new one based on deployment ID. + + Args: + image_name: The name of the Docker image to use as base. + stack: The ZenML stack containing container registry. + deployment_id: The deployment ID for caching. + app: The Modal app to store/retrieve images. + + Returns: + The configured Modal image. + + Raises: + RuntimeError: If no Docker credentials are found. + ValueError: If no container registry is found. + """ + if not stack.container_registry: + raise ValueError( + "No Container registry found in the stack. 
" + "Please add a container registry and ensure " + "it is correctly configured." + ) + + # Try to get existing image from the app + image_name_key = f"zenml_image_{deployment_id}" + + try: + # Try to look up existing image + existing_image = getattr(app, image_name_key, None) + if existing_image is not None: + logger.info( + f"Using cached Modal image for deployment {deployment_id}" + ) + return existing_image + except Exception: + # If lookup fails, we'll build a new image + pass + + logger.info("Building new Modal image") + logger.info(f"Base image: {image_name}") + + if docker_creds := stack.container_registry.credentials: + docker_username, docker_password = docker_creds + else: + raise RuntimeError( + "No Docker credentials found for the container registry." + ) + + # Create Modal secret for registry authentication + registry_secret = modal.Secret.from_dict( + { + "REGISTRY_USERNAME": docker_username, + "REGISTRY_PASSWORD": docker_password, + } + ) + + # Build new Modal image and register it with consistent name + logger.info(f"🔨 Building Modal image from base: {image_name}") + + # Build Modal image from the ZenML-built image + # Modal will automatically cache layers and reuse when possible + logger.info(f"Creating Modal image from base: {image_name}") + zenml_image = ( + modal.Image.from_registry( + image_name, secret=registry_secret + ).pip_install("modal") # Install Modal in the container + ) + + # Store the image in the app for future use + setattr(app, image_name_key, zenml_image) + + return zenml_image + + def build_modal_image( image_name: str, stack: "Stack", - environment: Dict[str, str], ) -> Any: """Build a Modal image from a ZenML-built Docker image. Args: image_name: The name of the Docker image to use as base. stack: The ZenML stack containing container registry. - environment: Environment variables to set in the image. Returns: The configured Modal image. 
@@ -243,9 +320,9 @@ def build_modal_image( # Modal will automatically cache layers and reuse when possible logger.info(f"Creating Modal image from base: {image_name}") zenml_image = ( - modal.Image.from_registry(image_name, secret=registry_secret) - .pip_install("modal") # Install Modal in the container - .env(environment) + modal.Image.from_registry( + image_name, secret=registry_secret + ).pip_install("modal") # Install Modal in the container ) return zenml_image From c12a37ff7f0443ff69bbf87a798a341d474fc182 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 11 Jul 2025 22:14:17 +0200 Subject: [PATCH 42/77] Refactor environment variable secrets handling --- .../integrations/modal/orchestrators/modal_orchestrator.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 83e67fc9b41..83dcbed2d68 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -311,13 +311,10 @@ async def _execute_pipeline_sandbox( with modal.enable_output(): # Create sandbox with the entrypoint command - # Use a single persistent secret per app for environment variables + # Use secrets for environment variables secrets = [] if environment: - secret_name = f"zenml-env-{app.name.replace('-', '_')}" - env_secret = modal.Secret.from_dict( - environment, name=secret_name - ) + env_secret = modal.Secret.from_dict(environment) secrets.append(env_secret) sb = await modal.Sandbox.create.aio( From 9a6daafb4d5066531437935459fbef061121afd5 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 11 Jul 2025 22:20:49 +0200 Subject: [PATCH 43/77] Add environment variables as command prefix in Modal orchestrator --- .../modal/orchestrators/modal_orchestrator.py | 19 +++---- .../modal_orchestrator_entrypoint.py | 56 +++++++++++-------- 2 files changed, 42 
insertions(+), 33 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 83dcbed2d68..d01fa4dd2fc 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -229,7 +229,14 @@ def prepare_or_run_pipeline( run_id=placeholder_run.id if placeholder_run else None, ) ) - entrypoint_command = command + args + + # Add environment variables as command prefix + env_prefix = [] + if environment: + for key, value in environment.items(): + env_prefix.extend([f"{key}={value}"]) + + entrypoint_command = ["env"] + env_prefix + command + args # Execute using sandbox try: @@ -248,7 +255,6 @@ def prepare_or_run_pipeline( cloud=settings.cloud or self.config.cloud, region=settings.region or self.config.region, timeout=settings.timeout or self.config.timeout, - environment=environment, synchronous=settings.synchronous if hasattr(settings, "synchronous") else self.config.synchronous, @@ -276,7 +282,6 @@ async def _execute_pipeline_sandbox( cloud: Optional[str] = None, region: Optional[str] = None, timeout: int = 86400, - environment: Optional[Dict[str, str]] = None, synchronous: bool = True, ) -> None: """Execute pipeline using Modal sandbox. 
@@ -293,7 +298,6 @@ async def _execute_pipeline_sandbox( cloud: Cloud provider to use region: Region to deploy in timeout: Maximum execution timeout - environment: Environment variables for the sandbox synchronous: Whether to wait for completion """ logger.info(f"Using Modal app: {app.name}") @@ -311,12 +315,6 @@ async def _execute_pipeline_sandbox( with modal.enable_output(): # Create sandbox with the entrypoint command - # Use secrets for environment variables - secrets = [] - if environment: - env_secret = modal.Secret.from_dict(environment) - secrets.append(env_secret) - sb = await modal.Sandbox.create.aio( *entrypoint_command, # Pass as separate arguments to avoid shell quoting issues image=zenml_image, @@ -327,7 +325,6 @@ async def _execute_pipeline_sandbox( region=region, app=app, timeout=timeout, - secrets=secrets, ) # Set tags on the sandbox for organization diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index 3bd99ad0d46..e8a79d04a14 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -41,9 +41,9 @@ ) from zenml.integrations.modal.utils import ( ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, - build_modal_image, generate_sandbox_tags, get_gpu_values, + get_or_build_modal_image, get_resource_values, setup_modal_client, ) @@ -107,34 +107,53 @@ def run_step_on_modal( # Get resource settings for this step resource_settings = step_config.settings.get(RESOURCE_SETTINGS_KEY) - # Build Modal image for this step + # Configure resources + gpu_values = get_gpu_values(step_settings_copy.gpu, resource_settings) + cpu_count, memory_mb = get_resource_values(resource_settings) + + # Create app name for this step + pipeline_name = deployment.pipeline_configuration.name.replace("_", "-") + app_name = f"zenml-pipeline-{pipeline_name}" + + # 
Create Modal app for caching and execution + app = modal.App.lookup( + app_name, + create_if_missing=True, + environment_name=step_settings_copy.modal_environment, + ) + + # Get or build Modal image for this step with caching client = Client() active_stack = client.active_stack image_name = ModalOrchestrator.get_image( deployment=deployment, step_name=step_name ) - zenml_image = build_modal_image(image_name, active_stack, environment) - - # Configure resources - gpu_values = get_gpu_values(step_settings_copy.gpu, resource_settings) - cpu_count, memory_mb = get_resource_values(resource_settings) + zenml_image = get_or_build_modal_image( + image_name=image_name, + stack=active_stack, + deployment_id=str(deployment.id), + app=app, + ) # Create step entrypoint command step_command = StepEntrypointConfiguration.get_entrypoint_command() step_args = StepEntrypointConfiguration.get_entrypoint_arguments( step_name=step_name, deployment_id=deployment.id ) - entrypoint_command = step_command + step_args - # Create app name for this step - pipeline_name = deployment.pipeline_configuration.name.replace("_", "-") - app_name = f"zenml-pipeline-{pipeline_name}" + # Add environment variables as command prefix + env_prefix = [] + if environment: + for key, value in environment.items(): + env_prefix.extend([f"{key}={value}"]) + + entrypoint_command = ["env"] + env_prefix + step_command + step_args # Execute step in sandbox try: asyncio.run( _execute_step_sandbox( - app_name=app_name, + app=app, step_name=step_name, zenml_image=zenml_image, entrypoint_command=entrypoint_command, @@ -145,7 +164,6 @@ def run_step_on_modal( cloud=step_settings_copy.cloud, region=step_settings_copy.region, timeout=step_settings_copy.timeout, - environment_name=step_settings_copy.modal_environment, ) ) logger.info(f"Step {step_name} completed successfully") @@ -155,7 +173,7 @@ def run_step_on_modal( async def _execute_step_sandbox( - app_name: str, + app: Any, step_name: str, zenml_image: Any, 
entrypoint_command: List[str], @@ -166,12 +184,11 @@ async def _execute_step_sandbox( cloud: Optional[str] = None, region: Optional[str] = None, timeout: int = 86400, - environment_name: Optional[str] = None, ) -> None: """Execute a single step using Modal sandbox. Args: - app_name: Name of the Modal app + app: Modal app instance step_name: Name of the step zenml_image: Pre-built ZenML Docker image for Modal entrypoint_command: Command to execute in the sandbox @@ -182,19 +199,14 @@ async def _execute_step_sandbox( cloud: Cloud provider to use region: Region to deploy in timeout: Maximum execution timeout - environment_name: Modal environment name """ - # Get or create persistent app - app = modal.App.lookup( - app_name, create_if_missing=True, environment_name=environment_name - ) - logger.info(f"Creating sandbox for step {step_name}") # Generate tags for the step sandbox step_tags = generate_sandbox_tags( pipeline_name=deployment.pipeline_configuration.name, deployment_id=str(deployment.id), + execution_mode="PER_STEP", step_name=step_name, ) logger.info(f"Step sandbox tags: {step_tags}") From 0b5311afe18bacffe6ef2b9e8a2acd6bc7eb0e8e Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 11 Jul 2025 22:31:19 +0200 Subject: [PATCH 44/77] Implement Modal sandbox executor for ZenML orchestration --- .../modal/orchestrators/modal_orchestrator.py | 188 ++--------- .../modal_orchestrator_entrypoint.py | 176 +--------- .../orchestrators/modal_sandbox_executor.py | 319 ++++++++++++++++++ 3 files changed, 359 insertions(+), 324 deletions(-) create mode 100644 src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index d01fa4dd2fc..0fca2699c1b 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -17,34 +17,24 @@ import 
os from typing import ( TYPE_CHECKING, - Any, Dict, Iterator, - List, Optional, Type, cast, ) from uuid import uuid4 -import modal - from zenml.config.base_settings import BaseSettings -from zenml.config.constants import RESOURCE_SETTINGS_KEY from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( ModalExecutionMode, ) -from zenml.integrations.modal.orchestrators.modal_orchestrator_entrypoint_configuration import ( - ModalOrchestratorEntrypointConfiguration, +from zenml.integrations.modal.orchestrators.modal_sandbox_executor import ( + ModalSandboxExecutor, ) from zenml.integrations.modal.utils import ( ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, create_modal_stack_validator, - generate_sandbox_tags, - get_gpu_values, - get_or_build_modal_image, - get_resource_settings_from_deployment, - get_resource_values, setup_modal_client, ) from zenml.logger import get_logger @@ -135,14 +125,14 @@ def prepare_or_run_pipeline( environment: Dict[str, str], placeholder_run: Optional["PipelineRunResponse"] = None, ) -> Optional[Iterator[Dict[str, MetadataType]]]: - """Runs the complete pipeline in a single Modal function. + """Runs the complete pipeline using Modal sandboxes. Args: deployment: The pipeline deployment to prepare or run. stack: The stack the pipeline will run on. environment: Environment variables to set in the orchestration environment. - placeholder_run: An optional placeholder run for the deployment (unused). + placeholder_run: An optional placeholder run for the deployment. Raises: Exception: If pipeline execution fails. @@ -152,7 +142,7 @@ def prepare_or_run_pipeline( """ if deployment.schedule: logger.warning( - "Serverless Orchestrator currently does not support the " + "Modal Orchestrator currently does not support the " "use of schedules. The `schedule` will be ignored " "and the pipeline will be run immediately." 
) @@ -162,17 +152,16 @@ def prepare_or_run_pipeline( # Generate orchestrator run ID and include pipeline run ID for isolation orchestrator_run_id = str(uuid4()) - environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id - # Pass pipeline run ID for proper isolation (following other orchestrators' pattern) + # Pass pipeline run ID for proper isolation if placeholder_run: environment["ZENML_PIPELINE_RUN_ID"] = str(placeholder_run.id) logger.debug(f"Pipeline run ID: {placeholder_run.id}") logger.debug(f"Orchestrator run ID: {orchestrator_run_id}") - # Get settings from pipeline configuration (applies to entire pipeline) + # Get settings from pipeline configuration settings = cast( "ModalOrchestratorSettings", self.get_settings(deployment) ) @@ -183,81 +172,29 @@ def prepare_or_run_pipeline( ) logger.info(f"Using execution mode: {execution_mode}") - # Get resource settings from pipeline configuration - resource_settings = get_resource_settings_from_deployment( - deployment, RESOURCE_SETTINGS_KEY - ) - - # Configure resources from resource settings - gpu_values = get_gpu_values(settings.gpu, resource_settings) - cpu_count, memory_mb = get_resource_values(resource_settings) - - # Execute pipeline using Modal sandboxes for maximum flexibility - logger.info("Starting pipeline execution with Modal sandboxes") - - # SANDBOX ARCHITECTURE: Simple persistent app per pipeline - pipeline_name = deployment.pipeline_configuration.name.replace( - "_", "-" - ) - app_name = f"zenml-pipeline-{pipeline_name}" - - # Create Modal app for caching and execution - app = modal.App.lookup( - app_name, - create_if_missing=True, - environment_name=settings.modal_environment - or self.config.modal_environment, - ) - - # Get or build Modal image with caching based on deployment ID - image_name = self.get_image(deployment=deployment) - zenml_image = get_or_build_modal_image( - image_name=image_name, + # Create sandbox executor + executor = ModalSandboxExecutor( + 
deployment=deployment, stack=stack, - deployment_id=str(deployment.id), - app=app, - ) - - # Build entrypoint command and args for the orchestrator sandbox - command = ( - ModalOrchestratorEntrypointConfiguration.get_entrypoint_command() - ) - args = ( - ModalOrchestratorEntrypointConfiguration.get_entrypoint_arguments( - deployment_id=deployment.id, - orchestrator_run_id=orchestrator_run_id, - run_id=placeholder_run.id if placeholder_run else None, - ) + environment=environment, + settings=settings, ) - # Add environment variables as command prefix - env_prefix = [] - if environment: - for key, value in environment.items(): - env_prefix.extend([f"{key}={value}"]) - - entrypoint_command = ["env"] + env_prefix + command + args + # Execute pipeline using the executor + logger.info("Starting pipeline execution with Modal sandboxes") - # Execute using sandbox try: + synchronous = ( + settings.synchronous + if hasattr(settings, "synchronous") + else self.config.synchronous + ) + asyncio.run( - self._execute_pipeline_sandbox( - app=app, - zenml_image=zenml_image, - entrypoint_command=entrypoint_command, - deployment=deployment, - run_id=str(placeholder_run.id) - if placeholder_run - else None, - gpu_values=gpu_values, - cpu_count=cpu_count, - memory_mb=memory_mb, - cloud=settings.cloud or self.config.cloud, - region=settings.region or self.config.region, - timeout=settings.timeout or self.config.timeout, - synchronous=settings.synchronous - if hasattr(settings, "synchronous") - else self.config.synchronous, + executor.execute_pipeline( + orchestrator_run_id=orchestrator_run_id, + run_id=str(placeholder_run.id) if placeholder_run else None, + synchronous=synchronous, ) ) except Exception as e: @@ -266,83 +203,6 @@ def prepare_or_run_pipeline( raise logger.info("Pipeline execution completed successfully") - return None - async def _execute_pipeline_sandbox( - self, - app: Any, - zenml_image: Any, - entrypoint_command: List[str], - deployment: "PipelineDeploymentResponse", 
- run_id: Optional[str] = None, - gpu_values: Optional[str] = None, - cpu_count: Optional[int] = None, - memory_mb: Optional[int] = None, - cloud: Optional[str] = None, - region: Optional[str] = None, - timeout: int = 86400, - synchronous: bool = True, - ) -> None: - """Execute pipeline using Modal sandbox. - - Args: - app: Modal app instance - zenml_image: Pre-built ZenML Docker image for Modal - entrypoint_command: Command to execute in the sandbox - deployment: Pipeline deployment for tagging - run_id: Pipeline run ID for tagging - gpu_values: GPU configuration string - cpu_count: Number of CPU cores - memory_mb: Memory allocation in MB - cloud: Cloud provider to use - region: Region to deploy in - timeout: Maximum execution timeout - synchronous: Whether to wait for completion - """ - logger.info(f"Using Modal app: {app.name}") - - logger.info("Creating sandbox for pipeline execution") - # Generate tags for the sandbox - sandbox_tags = generate_sandbox_tags( - pipeline_name=deployment.pipeline_configuration.name, - deployment_id=str(deployment.id), - execution_mode="PIPELINE", - run_id=run_id, - ) - logger.info(f"Sandbox tags: {sandbox_tags}") - - with modal.enable_output(): - # Create sandbox with the entrypoint command - sb = await modal.Sandbox.create.aio( - *entrypoint_command, # Pass as separate arguments to avoid shell quoting issues - image=zenml_image, - gpu=gpu_values, - cpu=cpu_count, - memory=memory_mb, - cloud=cloud, - region=region, - app=app, - timeout=timeout, - ) - - # Set tags on the sandbox for organization - sb.set_tags(sandbox_tags) - - logger.info("Sandbox created, executing pipeline...") - - if synchronous: - # Stream output while waiting for completion - logger.info("Streaming pipeline execution logs...") - async for line in sb.stdout: - # Stream logs to stdout with proper formatting - print(line, end="") - - # Ensure completion - await sb.wait.aio() - logger.info("Pipeline execution completed") - else: - logger.info( - "Pipeline started 
asynchronously (not waiting for completion)" - ) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index e8a79d04a14..2810928fe31 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -16,20 +16,13 @@ import argparse import asyncio import os -from copy import deepcopy -from typing import Any, Dict, List, Optional, cast +from typing import Any, Dict, Optional, cast from uuid import UUID -import modal - from zenml.client import Client -from zenml.config.constants import RESOURCE_SETTINGS_KEY from zenml.entrypoints.pipeline_entrypoint_configuration import ( PipelineEntrypointConfiguration, ) -from zenml.entrypoints.step_entrypoint_configuration import ( - StepEntrypointConfiguration, -) from zenml.enums import ExecutionStatus from zenml.exceptions import AuthorizationException from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( @@ -39,12 +32,11 @@ from zenml.integrations.modal.orchestrators.modal_orchestrator import ( ModalOrchestrator, ) +from zenml.integrations.modal.orchestrators.modal_sandbox_executor import ( + ModalSandboxExecutor, +) from zenml.integrations.modal.utils import ( ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, - generate_sandbox_tags, - get_gpu_values, - get_or_build_modal_image, - get_resource_values, setup_modal_client, ) from zenml.logger import get_logger @@ -70,169 +62,27 @@ def parse_args() -> argparse.Namespace: def run_step_on_modal( step_name: str, - deployment: Any, - settings: ModalOrchestratorSettings, - environment: Dict[str, str], + executor: ModalSandboxExecutor, ) -> None: """Run a pipeline step in a separate Modal sandbox. Args: step_name: Name of the step. - deployment: The deployment configuration. - settings: Modal orchestrator settings. - environment: Environment variables. 
+ executor: The Modal sandbox executor. Raises: Exception: If the sandbox fails to execute. """ logger.info(f"Running step '{step_name}' in Modal sandbox") - # Create a deep copy of pipeline-level settings to avoid modifying the original - step_settings_copy = deepcopy(settings) - - # Get step-specific settings if any - step_config = deployment.step_configurations[step_name].config - step_settings = step_config.settings.get("orchestrator.modal", None) - if step_settings: - step_modal_settings = ModalOrchestratorSettings.model_validate( - step_settings.model_dump() if step_settings else {} - ) - # Merge with pipeline-level settings copy - for key, value in step_modal_settings.model_dump( - exclude_unset=True - ).items(): - if value is not None: - setattr(step_settings_copy, key, value) - - # Get resource settings for this step - resource_settings = step_config.settings.get(RESOURCE_SETTINGS_KEY) - - # Configure resources - gpu_values = get_gpu_values(step_settings_copy.gpu, resource_settings) - cpu_count, memory_mb = get_resource_values(resource_settings) - - # Create app name for this step - pipeline_name = deployment.pipeline_configuration.name.replace("_", "-") - app_name = f"zenml-pipeline-{pipeline_name}" - - # Create Modal app for caching and execution - app = modal.App.lookup( - app_name, - create_if_missing=True, - environment_name=step_settings_copy.modal_environment, - ) - - # Get or build Modal image for this step with caching - client = Client() - active_stack = client.active_stack - image_name = ModalOrchestrator.get_image( - deployment=deployment, step_name=step_name - ) - zenml_image = get_or_build_modal_image( - image_name=image_name, - stack=active_stack, - deployment_id=str(deployment.id), - app=app, - ) - - # Create step entrypoint command - step_command = StepEntrypointConfiguration.get_entrypoint_command() - step_args = StepEntrypointConfiguration.get_entrypoint_arguments( - step_name=step_name, deployment_id=deployment.id - ) - - # Add 
environment variables as command prefix - env_prefix = [] - if environment: - for key, value in environment.items(): - env_prefix.extend([f"{key}={value}"]) - - entrypoint_command = ["env"] + env_prefix + step_command + step_args - - # Execute step in sandbox try: - asyncio.run( - _execute_step_sandbox( - app=app, - step_name=step_name, - zenml_image=zenml_image, - entrypoint_command=entrypoint_command, - deployment=deployment, - gpu_values=gpu_values, - cpu_count=cpu_count, - memory_mb=memory_mb, - cloud=step_settings_copy.cloud, - region=step_settings_copy.region, - timeout=step_settings_copy.timeout, - ) - ) + asyncio.run(executor.execute_step(step_name)) logger.info(f"Step {step_name} completed successfully") except Exception as e: logger.error(f"Step {step_name} failed: {e}") raise -async def _execute_step_sandbox( - app: Any, - step_name: str, - zenml_image: Any, - entrypoint_command: List[str], - deployment: Any, - gpu_values: Optional[str] = None, - cpu_count: Optional[int] = None, - memory_mb: Optional[int] = None, - cloud: Optional[str] = None, - region: Optional[str] = None, - timeout: int = 86400, -) -> None: - """Execute a single step using Modal sandbox. 
- - Args: - app: Modal app instance - step_name: Name of the step - zenml_image: Pre-built ZenML Docker image for Modal - entrypoint_command: Command to execute in the sandbox - deployment: Pipeline deployment for tagging - gpu_values: GPU configuration string - cpu_count: Number of CPU cores - memory_mb: Memory allocation in MB - cloud: Cloud provider to use - region: Region to deploy in - timeout: Maximum execution timeout - """ - logger.info(f"Creating sandbox for step {step_name}") - - # Generate tags for the step sandbox - step_tags = generate_sandbox_tags( - pipeline_name=deployment.pipeline_configuration.name, - deployment_id=str(deployment.id), - execution_mode="PER_STEP", - step_name=step_name, - ) - logger.info(f"Step sandbox tags: {step_tags}") - - with modal.enable_output(): - # Create sandbox for this step - sb = await modal.Sandbox.create.aio( - *entrypoint_command, - image=zenml_image, - gpu=gpu_values, - cpu=cpu_count, - memory=memory_mb, - cloud=cloud, - region=region, - app=app, - timeout=timeout, - ) - - # Set tags on the step sandbox - sb.set_tags(step_tags) - - # Wait for step completion - await sb.wait.aio() - logger.info(f"Sandbox for step {step_name} completed") - - def finalize_run( node_states: Dict[str, NodeStatus], args: argparse.Namespace ) -> None: @@ -348,10 +198,16 @@ def main() -> None: # PER_STEP mode: Execute each step in separate sandbox logger.info("Executing pipeline with per-step sandboxes") + # Create executor for step execution + executor = ModalSandboxExecutor( + deployment=deployment, + stack=active_stack, + environment=environment, + settings=pipeline_settings, + ) + def run_step_wrapper(step_name: str) -> None: - run_step_on_modal( - step_name, deployment, pipeline_settings, environment - ) + run_step_on_modal(step_name, executor) def finalize_wrapper(node_states: Dict[str, NodeStatus]) -> None: finalize_run(node_states, args) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py 
b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py new file mode 100644 index 00000000000..f03ef4f5707 --- /dev/null +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -0,0 +1,319 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Modal sandbox executor for ZenML orchestration.""" + +import asyncio +from typing import Any, Dict, List, Optional, TYPE_CHECKING + +import modal + +from zenml.client import Client +from zenml.config.constants import RESOURCE_SETTINGS_KEY +from zenml.entrypoints.pipeline_entrypoint_configuration import ( + PipelineEntrypointConfiguration, +) +from zenml.entrypoints.step_entrypoint_configuration import ( + StepEntrypointConfiguration, +) +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalOrchestratorSettings, +) +from zenml.integrations.modal.orchestrators.modal_orchestrator_entrypoint_configuration import ( + ModalOrchestratorEntrypointConfiguration, +) +from zenml.integrations.modal.utils import ( + generate_sandbox_tags, + get_gpu_values, + get_or_build_modal_image, + get_resource_values, +) +from zenml.logger import get_logger + +if TYPE_CHECKING: + from zenml.models import PipelineDeploymentResponse + from zenml.stack import Stack + +logger = get_logger(__name__) + + +class ModalSandboxExecutor: + """Handles execution of ZenML pipelines and steps in Modal sandboxes.""" + + def __init__( + self, + 
deployment: "PipelineDeploymentResponse", + stack: "Stack", + environment: Dict[str, str], + settings: ModalOrchestratorSettings, + ): + """Initialize the Modal sandbox executor. + + Args: + deployment: The pipeline deployment. + stack: The ZenML stack. + environment: Environment variables. + settings: Modal orchestrator settings. + """ + self.deployment = deployment + self.stack = stack + self.environment = environment + self.settings = settings + self.client = Client() + + # Create Modal app for this pipeline + pipeline_name = deployment.pipeline_configuration.name.replace("_", "-") + self.app_name = f"zenml-pipeline-{pipeline_name}" + self.app = modal.App.lookup( + self.app_name, + create_if_missing=True, + environment_name=settings.modal_environment, + ) + + def _build_entrypoint_command( + self, + base_command: List[str], + args: List[str] + ) -> List[str]: + """Build the complete entrypoint command with environment variables. + + Args: + base_command: Base command to execute. + args: Arguments for the command. + + Returns: + Complete command with environment variables. + """ + env_prefix = [] + if self.environment: + for key, value in self.environment.items(): + env_prefix.extend([f"{key}={value}"]) + + return ["env"] + env_prefix + base_command + args + + def _get_step_settings(self, step_name: str) -> ModalOrchestratorSettings: + """Get merged settings for a specific step. + + Args: + step_name: Name of the step. + + Returns: + Merged Modal orchestrator settings. 
+ """ + # Start with pipeline-level settings + merged_settings = ModalOrchestratorSettings.model_validate( + self.settings.model_dump() + ) + + # Get step-specific settings + step_config = self.deployment.step_configurations[step_name].config + step_settings = step_config.settings.get("orchestrator.modal") + + if step_settings: + step_modal_settings = ModalOrchestratorSettings.model_validate( + step_settings.model_dump() + ) + # Merge step settings over pipeline settings + for key, value in step_modal_settings.model_dump(exclude_unset=True).items(): + if value is not None: + setattr(merged_settings, key, value) + + return merged_settings + + def _get_resource_config( + self, + step_name: Optional[str] = None + ) -> tuple[Optional[str], Optional[int], Optional[int]]: + """Get resource configuration for pipeline or step. + + Args: + step_name: Name of the step (None for pipeline-level). + + Returns: + Tuple of (gpu_values, cpu_count, memory_mb). + """ + if step_name: + step_settings = self._get_step_settings(step_name) + step_config = self.deployment.step_configurations[step_name].config + resource_settings = step_config.settings.get(RESOURCE_SETTINGS_KEY) + gpu_values = get_gpu_values(step_settings.gpu, resource_settings) + else: + # Pipeline-level resource settings + resource_settings = None + gpu_values = get_gpu_values(self.settings.gpu, resource_settings) + + cpu_count, memory_mb = get_resource_values(resource_settings) + return gpu_values, cpu_count, memory_mb + + async def _execute_sandbox( + self, + entrypoint_command: List[str], + execution_mode: str, + step_name: Optional[str] = None, + run_id: Optional[str] = None, + synchronous: bool = True, + ) -> None: + """Execute a sandbox with the given command. + + Args: + entrypoint_command: Command to execute in the sandbox. + execution_mode: Execution mode for tagging. + step_name: Name of the step (for step execution). + run_id: Pipeline run ID for tagging. + synchronous: Whether to wait for completion. 
+ """ + # Get resource configuration + gpu_values, cpu_count, memory_mb = self._get_resource_config(step_name) + + # Get or build Modal image + image_name = self._get_image_name(step_name) + zenml_image = get_or_build_modal_image( + image_name=image_name, + stack=self.stack, + deployment_id=str(self.deployment.id), + app=self.app, + ) + + # Generate tags + tags = generate_sandbox_tags( + pipeline_name=self.deployment.pipeline_configuration.name, + deployment_id=str(self.deployment.id), + execution_mode=execution_mode, + step_name=step_name, + run_id=run_id, + ) + + logger.info(f"Creating sandbox for {execution_mode.lower()} execution") + logger.info(f"Sandbox tags: {tags}") + + with modal.enable_output(): + # Create sandbox + sb = await modal.Sandbox.create.aio( + *entrypoint_command, + image=zenml_image, + gpu=gpu_values, + cpu=cpu_count, + memory=memory_mb, + cloud=self.settings.cloud, + region=self.settings.region, + app=self.app, + timeout=self.settings.timeout, + ) + + # Set tags + sb.set_tags(tags) + + if synchronous: + # Stream output for better user experience + async for line in sb.stdout: + print(line, end="") + await sb.wait.aio() + else: + logger.info("Sandbox started asynchronously") + + def _get_image_name(self, step_name: Optional[str] = None) -> str: + """Get the image name for the pipeline or step. + + Args: + step_name: Name of the step (None for pipeline-level). + + Returns: + Image name to use. 
+ """ + if step_name: + from zenml.integrations.modal.orchestrators.modal_orchestrator import ( + ModalOrchestrator, + ) + return ModalOrchestrator.get_image( + deployment=self.deployment, step_name=step_name + ) + else: + from zenml.integrations.modal.orchestrators.modal_orchestrator import ( + ModalOrchestrator, + ) + return ModalOrchestrator.get_image(deployment=self.deployment) + + async def execute_pipeline( + self, + orchestrator_run_id: str, + run_id: Optional[str] = None, + synchronous: bool = True, + ) -> None: + """Execute the entire pipeline in a single sandbox. + + Args: + orchestrator_run_id: The orchestrator run ID. + run_id: The pipeline run ID. + synchronous: Whether to wait for completion. + """ + logger.info("Executing entire pipeline in single sandbox") + + # Build entrypoint command + command = ModalOrchestratorEntrypointConfiguration.get_entrypoint_command() + args = ModalOrchestratorEntrypointConfiguration.get_entrypoint_arguments( + deployment_id=self.deployment.id, + orchestrator_run_id=orchestrator_run_id, + run_id=run_id, + ) + entrypoint_command = self._build_entrypoint_command(command, args) + + # Execute pipeline sandbox + await self._execute_sandbox( + entrypoint_command=entrypoint_command, + execution_mode="PIPELINE", + run_id=run_id, + synchronous=synchronous, + ) + + async def execute_step(self, step_name: str) -> None: + """Execute a single step in its own sandbox. + + Args: + step_name: Name of the step to execute. 
+ """ + logger.info(f"Executing step '{step_name}' in separate sandbox") + + # Build step entrypoint command + command = StepEntrypointConfiguration.get_entrypoint_command() + args = StepEntrypointConfiguration.get_entrypoint_arguments( + step_name=step_name, deployment_id=self.deployment.id + ) + entrypoint_command = self._build_entrypoint_command(command, args) + + # Execute step sandbox + await self._execute_sandbox( + entrypoint_command=entrypoint_command, + execution_mode="PER_STEP", + step_name=step_name, + synchronous=True, # Steps are always synchronous + ) + + def execute_pipeline_sync( + self, + orchestrator_run_id: str, + run_id: Optional[str] = None, + synchronous: bool = True, + ) -> None: + """Execute the entire pipeline synchronously. + + Args: + orchestrator_run_id: The orchestrator run ID. + run_id: The pipeline run ID. + synchronous: Whether to wait for completion. + """ + # Execute entire pipeline in this sandbox (for PIPELINE mode) + entrypoint_args = PipelineEntrypointConfiguration.get_entrypoint_arguments( + deployment_id=self.deployment.id + ) + config = PipelineEntrypointConfiguration(arguments=entrypoint_args) + config.run() \ No newline at end of file From 5f4c207751dbdd02e8c704787614f3e4ad536999 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 11 Jul 2025 22:35:15 +0200 Subject: [PATCH 45/77] Refactor execution methods for better clarity --- .../modal/orchestrators/modal_orchestrator.py | 8 +- .../modal_orchestrator_entrypoint.py | 2 +- .../orchestrators/modal_sandbox_executor.py | 102 +++++++++--------- 3 files changed, 53 insertions(+), 59 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 0fca2699c1b..159e78b5198 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -189,11 +189,13 @@ def prepare_or_run_pipeline( if 
hasattr(settings, "synchronous") else self.config.synchronous ) - + asyncio.run( executor.execute_pipeline( orchestrator_run_id=orchestrator_run_id, - run_id=str(placeholder_run.id) if placeholder_run else None, + run_id=str(placeholder_run.id) + if placeholder_run + else None, synchronous=synchronous, ) ) @@ -204,5 +206,3 @@ def prepare_or_run_pipeline( logger.info("Pipeline execution completed successfully") return None - - diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index 2810928fe31..ec2689b63e0 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -16,7 +16,7 @@ import argparse import asyncio import os -from typing import Any, Dict, Optional, cast +from typing import Any, Dict, cast from uuid import UUID from zenml.client import Client diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index f03ef4f5707..9c1a13e7e99 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -13,16 +13,12 @@ # permissions and limitations under the License. 
"""Modal sandbox executor for ZenML orchestration.""" -import asyncio -from typing import Any, Dict, List, Optional, TYPE_CHECKING +from typing import TYPE_CHECKING, Dict, List, Optional import modal from zenml.client import Client from zenml.config.constants import RESOURCE_SETTINGS_KEY -from zenml.entrypoints.pipeline_entrypoint_configuration import ( - PipelineEntrypointConfiguration, -) from zenml.entrypoints.step_entrypoint_configuration import ( StepEntrypointConfiguration, ) @@ -36,6 +32,7 @@ generate_sandbox_tags, get_gpu_values, get_or_build_modal_image, + get_resource_settings_from_deployment, get_resource_values, ) from zenml.logger import get_logger @@ -70,9 +67,11 @@ def __init__( self.environment = environment self.settings = settings self.client = Client() - + # Create Modal app for this pipeline - pipeline_name = deployment.pipeline_configuration.name.replace("_", "-") + pipeline_name = deployment.pipeline_configuration.name.replace( + "_", "-" + ) self.app_name = f"zenml-pipeline-{pipeline_name}" self.app = modal.App.lookup( self.app_name, @@ -81,9 +80,7 @@ def __init__( ) def _build_entrypoint_command( - self, - base_command: List[str], - args: List[str] + self, base_command: List[str], args: List[str] ) -> List[str]: """Build the complete entrypoint command with environment variables. 
@@ -114,25 +111,26 @@ def _get_step_settings(self, step_name: str) -> ModalOrchestratorSettings: merged_settings = ModalOrchestratorSettings.model_validate( self.settings.model_dump() ) - + # Get step-specific settings step_config = self.deployment.step_configurations[step_name].config step_settings = step_config.settings.get("orchestrator.modal") - + if step_settings: step_modal_settings = ModalOrchestratorSettings.model_validate( step_settings.model_dump() ) # Merge step settings over pipeline settings - for key, value in step_modal_settings.model_dump(exclude_unset=True).items(): + for key, value in step_modal_settings.model_dump( + exclude_unset=True + ).items(): if value is not None: setattr(merged_settings, key, value) - + return merged_settings def _get_resource_config( - self, - step_name: Optional[str] = None + self, step_name: Optional[str] = None ) -> tuple[Optional[str], Optional[int], Optional[int]]: """Get resource configuration for pipeline or step. @@ -149,7 +147,9 @@ def _get_resource_config( gpu_values = get_gpu_values(step_settings.gpu, resource_settings) else: # Pipeline-level resource settings - resource_settings = None + resource_settings = get_resource_settings_from_deployment( + self.deployment, RESOURCE_SETTINGS_KEY + ) gpu_values = get_gpu_values(self.settings.gpu, resource_settings) cpu_count, memory_mb = get_resource_values(resource_settings) @@ -175,6 +175,17 @@ async def _execute_sandbox( # Get resource configuration gpu_values, cpu_count, memory_mb = self._get_resource_config(step_name) + # Get settings (step-specific for steps, pipeline-level for pipeline) + if step_name: + step_settings = self._get_step_settings(step_name) + cloud = step_settings.cloud + region = step_settings.region + timeout = step_settings.timeout + else: + cloud = self.settings.cloud + region = self.settings.region + timeout = self.settings.timeout + # Get or build Modal image image_name = self._get_image_name(step_name) zenml_image = get_or_build_modal_image( 
@@ -204,10 +215,10 @@ async def _execute_sandbox( gpu=gpu_values, cpu=cpu_count, memory=memory_mb, - cloud=self.settings.cloud, - region=self.settings.region, + cloud=cloud, + region=region, app=self.app, - timeout=self.settings.timeout, + timeout=timeout, ) # Set tags @@ -230,21 +241,20 @@ def _get_image_name(self, step_name: Optional[str] = None) -> str: Returns: Image name to use. """ + # Import here to avoid circular imports + from zenml.integrations.modal.orchestrators.modal_orchestrator import ( + ModalOrchestrator, + ) + if step_name: - from zenml.integrations.modal.orchestrators.modal_orchestrator import ( - ModalOrchestrator, - ) return ModalOrchestrator.get_image( deployment=self.deployment, step_name=step_name ) else: - from zenml.integrations.modal.orchestrators.modal_orchestrator import ( - ModalOrchestrator, - ) return ModalOrchestrator.get_image(deployment=self.deployment) async def execute_pipeline( - self, + self, orchestrator_run_id: str, run_id: Optional[str] = None, synchronous: bool = True, @@ -257,13 +267,17 @@ async def execute_pipeline( synchronous: Whether to wait for completion. """ logger.info("Executing entire pipeline in single sandbox") - + # Build entrypoint command - command = ModalOrchestratorEntrypointConfiguration.get_entrypoint_command() - args = ModalOrchestratorEntrypointConfiguration.get_entrypoint_arguments( - deployment_id=self.deployment.id, - orchestrator_run_id=orchestrator_run_id, - run_id=run_id, + command = ( + ModalOrchestratorEntrypointConfiguration.get_entrypoint_command() + ) + args = ( + ModalOrchestratorEntrypointConfiguration.get_entrypoint_arguments( + deployment_id=self.deployment.id, + orchestrator_run_id=orchestrator_run_id, + run_id=run_id, + ) ) entrypoint_command = self._build_entrypoint_command(command, args) @@ -282,7 +296,7 @@ async def execute_step(self, step_name: str) -> None: step_name: Name of the step to execute. 
""" logger.info(f"Executing step '{step_name}' in separate sandbox") - + # Build step entrypoint command command = StepEntrypointConfiguration.get_entrypoint_command() args = StepEntrypointConfiguration.get_entrypoint_arguments( @@ -297,23 +311,3 @@ async def execute_step(self, step_name: str) -> None: step_name=step_name, synchronous=True, # Steps are always synchronous ) - - def execute_pipeline_sync( - self, - orchestrator_run_id: str, - run_id: Optional[str] = None, - synchronous: bool = True, - ) -> None: - """Execute the entire pipeline synchronously. - - Args: - orchestrator_run_id: The orchestrator run ID. - run_id: The pipeline run ID. - synchronous: Whether to wait for completion. - """ - # Execute entire pipeline in this sandbox (for PIPELINE mode) - entrypoint_args = PipelineEntrypointConfiguration.get_entrypoint_arguments( - deployment_id=self.deployment.id - ) - config = PipelineEntrypointConfiguration(arguments=entrypoint_args) - config.run() \ No newline at end of file From 623c5fb6c0fe1b1aa5c1bc501382ed4e88d29050 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 11 Jul 2025 22:53:58 +0200 Subject: [PATCH 46/77] Refactor entrypoint command creation for sandbox executor --- .../orchestrators/modal_sandbox_executor.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 9c1a13e7e99..656e8947168 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -13,7 +13,7 @@ # permissions and limitations under the License. 
"""Modal sandbox executor for ZenML orchestration.""" -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional import modal @@ -82,21 +82,18 @@ def __init__( def _build_entrypoint_command( self, base_command: List[str], args: List[str] ) -> List[str]: - """Build the complete entrypoint command with environment variables. + """Build the complete entrypoint command (without environment variables). + + Environment variables are now passed via secrets parameter to sandbox. Args: base_command: Base command to execute. args: Arguments for the command. Returns: - Complete command with environment variables. + Complete command without environment variables. """ - env_prefix = [] - if self.environment: - for key, value in self.environment.items(): - env_prefix.extend([f"{key}={value}"]) - - return ["env"] + env_prefix + base_command + args + return base_command + args def _get_step_settings(self, step_name: str) -> ModalOrchestratorSettings: """Get merged settings for a specific step. @@ -129,6 +126,19 @@ def _get_step_settings(self, step_name: str) -> ModalOrchestratorSettings: return merged_settings + def _create_environment_secret(self) -> Optional[Any]: + """Create a Modal secret containing environment variables. + + Returns: + Modal secret with environment variables, or None if no env vars. 
+ """ + if not self.environment: + return None + + # Create secret from environment variables + # Modal handles efficiency internally + return modal.Secret.from_dict(self.environment) + def _get_resource_config( self, step_name: Optional[str] = None ) -> tuple[Optional[str], Optional[int], Optional[int]]: @@ -195,6 +205,10 @@ async def _execute_sandbox( app=self.app, ) + # Create environment secret + env_secret = self._create_environment_secret() + secrets = [env_secret] if env_secret else [] + # Generate tags tags = generate_sandbox_tags( pipeline_name=self.deployment.pipeline_configuration.name, @@ -208,7 +222,7 @@ async def _execute_sandbox( logger.info(f"Sandbox tags: {tags}") with modal.enable_output(): - # Create sandbox + # Create sandbox with environment variables passed as secrets sb = await modal.Sandbox.create.aio( *entrypoint_command, image=zenml_image, @@ -219,6 +233,7 @@ async def _execute_sandbox( region=region, app=self.app, timeout=timeout, + secrets=secrets, ) # Set tags From 27af8a3740c7cb7f4e7c8a568b609751118860aa Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 11 Jul 2025 23:09:36 +0200 Subject: [PATCH 47/77] Remove redundant logging message in orchestrator class --- src/zenml/integrations/modal/orchestrators/modal_orchestrator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 159e78b5198..351f35f6060 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -201,7 +201,6 @@ def prepare_or_run_pipeline( ) except Exception as e: logger.error(f"Pipeline execution failed: {e}") - logger.info("Check Modal dashboard for detailed logs") raise logger.info("Pipeline execution completed successfully") From 0b37850e66dc8ae3682e1180e0080ad47064820c Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 11 Jul 2025 
23:32:53 +0200 Subject: [PATCH 48/77] Refactor ModalSandboxExecutor for step-specific settings --- .../orchestrators/modal_sandbox_executor.py | 65 ++++++++++++------- src/zenml/integrations/modal/utils.py | 7 +- 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 656e8947168..6674c482150 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -19,6 +19,7 @@ from zenml.client import Client from zenml.config.constants import RESOURCE_SETTINGS_KEY +from zenml.config.resource_settings import ResourceSettings from zenml.entrypoints.step_entrypoint_configuration import ( StepEntrypointConfiguration, ) @@ -105,25 +106,35 @@ def _get_step_settings(self, step_name: str) -> ModalOrchestratorSettings: Merged Modal orchestrator settings. """ # Start with pipeline-level settings - merged_settings = ModalOrchestratorSettings.model_validate( - self.settings.model_dump() - ) + pipeline_settings_dict = self.settings.model_dump() # Get step-specific settings - step_config = self.deployment.step_configurations[step_name].config - step_settings = step_config.settings.get("orchestrator.modal") - - if step_settings: - step_modal_settings = ModalOrchestratorSettings.model_validate( - step_settings.model_dump() - ) - # Merge step settings over pipeline settings - for key, value in step_modal_settings.model_dump( - exclude_unset=True - ).items(): - if value is not None: - setattr(merged_settings, key, value) - + if step_name in self.deployment.step_configurations: + step_config = self.deployment.step_configurations[step_name].config + step_settings = step_config.settings.get("orchestrator.modal") + + if step_settings: + # Handle both dict and Pydantic model cases + if hasattr(step_settings, "model_dump"): + step_settings_data = 
step_settings.model_dump() + else: + step_settings_data = step_settings + + step_modal_settings = ModalOrchestratorSettings.model_validate( + step_settings_data + ) + # Merge step settings over pipeline settings + step_settings_dict = step_modal_settings.model_dump( + exclude_unset=True + ) + for key, value in step_settings_dict.items(): + if value is not None: + pipeline_settings_dict[key] = value + + # Create merged settings from the combined dictionary + merged_settings = ModalOrchestratorSettings.model_validate( + pipeline_settings_dict + ) return merged_settings def _create_environment_secret(self) -> Optional[Any]: @@ -137,7 +148,11 @@ def _create_environment_secret(self) -> Optional[Any]: # Create secret from environment variables # Modal handles efficiency internally - return modal.Secret.from_dict(self.environment) + # Cast to Dict[str, str | None] to match Modal's expected type + env_dict: Dict[str, Optional[str]] = { + k: v for k, v in self.environment.items() + } + return modal.Secret.from_dict(env_dict) def _get_resource_config( self, step_name: Optional[str] = None @@ -154,15 +169,21 @@ def _get_resource_config( step_settings = self._get_step_settings(step_name) step_config = self.deployment.step_configurations[step_name].config resource_settings = step_config.settings.get(RESOURCE_SETTINGS_KEY) - gpu_values = get_gpu_values(step_settings.gpu, resource_settings) + gpu_values = get_gpu_values( + step_settings.gpu, resource_settings or ResourceSettings() + ) else: # Pipeline-level resource settings resource_settings = get_resource_settings_from_deployment( self.deployment, RESOURCE_SETTINGS_KEY ) - gpu_values = get_gpu_values(self.settings.gpu, resource_settings) + gpu_values = get_gpu_values( + self.settings.gpu, resource_settings or ResourceSettings() + ) - cpu_count, memory_mb = get_resource_values(resource_settings) + cpu_count, memory_mb = get_resource_values( + resource_settings or ResourceSettings() + ) return gpu_values, cpu_count, memory_mb 
async def _execute_sandbox( @@ -291,7 +312,7 @@ async def execute_pipeline( ModalOrchestratorEntrypointConfiguration.get_entrypoint_arguments( deployment_id=self.deployment.id, orchestrator_run_id=orchestrator_run_id, - run_id=run_id, + run_id=run_id or None, ) ) entrypoint_command = self._build_entrypoint_command(command, args) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 97bfade47db..62d5828eb00 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -225,7 +225,7 @@ def get_or_build_modal_image( try: # Try to look up existing image - existing_image = getattr(app, image_name_key, None) + existing_image = modal.Image.from_id(image_name_key) if existing_image is not None: logger.info( f"Using cached Modal image for deployment {deployment_id}" @@ -274,12 +274,14 @@ def get_or_build_modal_image( def build_modal_image( image_name: str, stack: "Stack", + environment: Optional[Dict[str, str]] = None, ) -> Any: """Build a Modal image from a ZenML-built Docker image. Args: image_name: The name of the Docker image to use as base. stack: The ZenML stack containing container registry. + environment: The environment variables to pass to the image. Returns: The configured Modal image. 
@@ -325,6 +327,9 @@ def build_modal_image( ).pip_install("modal") # Install Modal in the container ) + if environment: + zenml_image = zenml_image.env(environment) + return zenml_image From b5935f6bb64f667e601e1255a149ba71bee5f627 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sat, 12 Jul 2025 00:15:49 +0200 Subject: [PATCH 49/77] Refactor image caching logic for pipeline builds --- .../orchestrators/modal_sandbox_executor.py | 34 ++++++++++++++++++- src/zenml/integrations/modal/utils.py | 32 +++++++++++------ 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 6674c482150..8d4535f1d23 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -222,7 +222,8 @@ async def _execute_sandbox( zenml_image = get_or_build_modal_image( image_name=image_name, stack=self.stack, - deployment_id=str(self.deployment.id), + pipeline_name=self.deployment.pipeline_configuration.name, + build_id=str(self.deployment.build.id), app=self.app, ) @@ -268,6 +269,37 @@ async def _execute_sandbox( else: logger.info("Sandbox started asynchronously") + # Store the image ID for future caching after sandbox creation + # The image should be hydrated after being used in sandbox creation + await self._store_image_id(zenml_image) + + async def _store_image_id(self, zenml_image: Any) -> None: + """Store the image ID for future caching after sandbox creation. + + Args: + zenml_image: The Modal image that was used. 
+ """ + try: + # After sandbox creation, the image should be hydrated + zenml_image.hydrate() + if hasattr(zenml_image, "object_id") and zenml_image.object_id: + image_name_key = f"zenml_image_{self.deployment.build.id}" + + # Store the image ID in Modal's persistent storage + pipeline_name = self.deployment.pipeline_configuration.name + stored_id = modal.Dict.from_name( + f"zenml-image-cache-{pipeline_name}", + create_if_missing=True, + ) + stored_id[image_name_key] = zenml_image.object_id + logger.info( + f"Stored Modal image ID for build {self.deployment.build.id}" + ) + else: + logger.warning("Image not hydrated after sandbox creation") + except Exception as e: + logger.warning(f"Failed to store image ID: {e}") + def _get_image_name(self, step_name: Optional[str] = None) -> str: """Get the image name for the pipeline or step. diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 62d5828eb00..d6f9b7d06eb 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -195,15 +195,17 @@ def get_resource_values( def get_or_build_modal_image( image_name: str, stack: "Stack", - deployment_id: str, + pipeline_name: str, + build_id: str, app: Any, ) -> Any: - """Get existing Modal image or build new one based on deployment ID. + """Get existing Modal image or build new one based on pipeline name and build ID. Args: image_name: The name of the Docker image to use as base. stack: The ZenML stack containing container registry. - deployment_id: The deployment ID for caching. + pipeline_name: The pipeline name for caching. + build_id: The build ID for the image key. app: The Modal app to store/retrieve images. 
Returns: @@ -221,16 +223,26 @@ def get_or_build_modal_image( ) # Try to get existing image from the app - image_name_key = f"zenml_image_{deployment_id}" + image_name_key = f"zenml_image_{build_id}" try: - # Try to look up existing image - existing_image = modal.Image.from_id(image_name_key) - if existing_image is not None: - logger.info( - f"Using cached Modal image for deployment {deployment_id}" + # Try to look up existing image by ID from Modal's object store + # We'll store the image ID as a Modal object for persistence + try: + # Try to get stored image ID + stored_id = modal.Dict.from_name( + f"zenml-image-cache-{pipeline_name}" ) - return existing_image + if image_name_key in stored_id: + image_id = stored_id[image_name_key] + existing_image = modal.Image.from_id(image_id) + logger.info( + f"Using cached Modal image for build {build_id} in pipeline {pipeline_name}" + ) + return existing_image + except Exception: + # Dict doesn't exist or image not found, will build new one + pass except Exception: # If lookup fails, we'll build a new image pass From 4be1fc77c4185b5e2ec2ff2b81515ba7c9fb8bd4 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sat, 12 Jul 2025 17:33:31 +0200 Subject: [PATCH 50/77] Add shared image cache for pipeline step execution --- .../modal_orchestrator_entrypoint.py | 208 ++++++++++++++---- .../orchestrators/modal_sandbox_executor.py | 99 +++++++-- 2 files changed, 246 insertions(+), 61 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index ec2689b63e0..572049f7b37 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -16,10 +16,16 @@ import argparse import asyncio import os -from typing import Any, Dict, cast +from typing import TYPE_CHECKING, Any, Dict, cast from uuid import UUID +import modal 
+ from zenml.client import Client + +if TYPE_CHECKING: + from zenml.models import PipelineDeploymentResponse + from zenml.stack import Stack from zenml.entrypoints.pipeline_entrypoint_configuration import ( PipelineEntrypointConfiguration, ) @@ -83,6 +89,161 @@ def run_step_on_modal( raise +async def prepare_shared_image_cache( + deployment: "PipelineDeploymentResponse", + stack: "Stack", + settings: ModalOrchestratorSettings, +) -> tuple[Dict[str, Any], Any]: + """Pre-build all required images for pipeline steps and create shared Modal app. + + This function analyzes all steps in the deployment, identifies unique images + needed, and pre-builds them to avoid redundant builds during step execution. + + Args: + deployment: The pipeline deployment. + stack: The ZenML stack. + settings: Modal orchestrator settings. + + Returns: + Tuple of (shared_image_cache, shared_modal_app). + """ + from zenml.integrations.modal.orchestrators.modal_orchestrator import ( + ModalOrchestrator, + ) + from zenml.integrations.modal.utils import get_or_build_modal_image + + logger.info("Preparing shared image cache for per-step execution") + + # Create shared Modal app + pipeline_name = deployment.pipeline_configuration.name.replace("_", "-") + app_name = f"zenml-pipeline-{pipeline_name}" + shared_app = modal.App.lookup( + app_name, + create_if_missing=True, + environment_name=settings.modal_environment, + ) + + image_cache: Dict[str, Any] = {} + + # Collect all unique images needed across all steps + unique_images: Dict[str, str] = {} # cache_key -> image_name + + # Add pipeline-level image if needed + pipeline_image = ModalOrchestrator.get_image(deployment=deployment) + build_id = str(deployment.build.id) + pipeline_cache_key = ( + f"{build_id}_pipeline_{str(hash(pipeline_image))[-8:]}" + ) + unique_images[pipeline_cache_key] = pipeline_image + + # Add step-specific images + for step_name in deployment.step_configurations: + step_image = ModalOrchestrator.get_image( + 
deployment=deployment, step_name=step_name + ) + image_hash = str(hash(step_image))[-8:] + step_cache_key = f"{build_id}_{step_name}_{image_hash}" + unique_images[step_cache_key] = step_image + + # Build all unique images + logger.info(f"Building {len(unique_images)} unique images for pipeline") + for cache_key, image_name in unique_images.items(): + logger.info(f"Building image: {cache_key} from {image_name}") + try: + built_image = get_or_build_modal_image( + image_name=image_name, + stack=stack, + pipeline_name=deployment.pipeline_configuration.name, + build_id=build_id, + app=shared_app, + ) + image_cache[cache_key] = built_image + logger.info(f"Successfully cached image: {cache_key}") + except Exception as e: + logger.error(f"Failed to build image {cache_key}: {e}") + raise + + logger.info(f"Image cache prepared with {len(image_cache)} images") + return image_cache, shared_app + + +def execute_pipeline_mode(args: argparse.Namespace) -> None: + """Execute entire pipeline in single sandbox mode. + + Args: + args: Parsed command line arguments. + """ + logger.info("Executing entire pipeline in single sandbox") + entrypoint_args = PipelineEntrypointConfiguration.get_entrypoint_arguments( + deployment_id=args.deployment_id + ) + config = PipelineEntrypointConfiguration(arguments=entrypoint_args) + config.run() + + +def execute_per_step_mode( + deployment: "PipelineDeploymentResponse", + active_stack: "Stack", + environment: Dict[str, str], + pipeline_settings: ModalOrchestratorSettings, + args: argparse.Namespace, +) -> None: + """Execute pipeline with per-step sandboxes. + + Args: + deployment: The pipeline deployment. + active_stack: The active ZenML stack. + environment: Environment variables. + pipeline_settings: Modal orchestrator settings. + args: Parsed command line arguments. 
+ """ + logger.info("Executing pipeline with per-step sandboxes") + + # Prepare shared image cache and Modal app for all steps + logger.info("Pre-building images for step execution") + shared_image_cache, shared_app = asyncio.run( + prepare_shared_image_cache( + deployment=deployment, + stack=active_stack, + settings=pipeline_settings, + ) + ) + + # Create shared executor instance that will be reused across steps + shared_executor = ModalSandboxExecutor( + deployment=deployment, + stack=active_stack, + environment=environment, + settings=pipeline_settings, + shared_image_cache=shared_image_cache, + shared_app=shared_app, + ) + + def run_step_wrapper(step_name: str) -> None: + """Wrapper to execute step with shared resources.""" + run_step_on_modal(step_name, shared_executor) + + def finalize_wrapper(node_states: Dict[str, NodeStatus]) -> None: + """Wrapper to finalize pipeline execution.""" + finalize_run(node_states, args) + + # Build DAG from deployment + pipeline_dag = { + step_name: step.spec.upstream_steps + for step_name, step in deployment.step_configurations.items() + } + + logger.info(f"Executing {len(pipeline_dag)} steps with shared image cache") + + # Run using ThreadedDagRunner with optimized execution + ThreadedDagRunner( + dag=pipeline_dag, + run_fn=run_step_wrapper, + finalize_fn=finalize_wrapper, + max_parallelism=getattr(pipeline_settings, "max_parallelism", None), + ).run() + + def finalize_run( node_states: Dict[str, NodeStatus], args: argparse.Namespace ) -> None: @@ -183,51 +344,14 @@ def main() -> None: ) try: + # Execute pipeline based on execution mode if execution_mode == ModalExecutionMode.PIPELINE: - # Execute entire pipeline in this sandbox - logger.info("Executing entire pipeline in single sandbox") - entrypoint_args = ( - PipelineEntrypointConfiguration.get_entrypoint_arguments( - deployment_id=args.deployment_id - ) - ) - config = PipelineEntrypointConfiguration(arguments=entrypoint_args) - config.run() - + execute_pipeline_mode(args) 
else: - # PER_STEP mode: Execute each step in separate sandbox - logger.info("Executing pipeline with per-step sandboxes") - - # Create executor for step execution - executor = ModalSandboxExecutor( - deployment=deployment, - stack=active_stack, - environment=environment, - settings=pipeline_settings, + execute_per_step_mode( + deployment, active_stack, environment, pipeline_settings, args ) - def run_step_wrapper(step_name: str) -> None: - run_step_on_modal(step_name, executor) - - def finalize_wrapper(node_states: Dict[str, NodeStatus]) -> None: - finalize_run(node_states, args) - - # Build DAG from deployment - pipeline_dag = { - step_name: step.spec.upstream_steps - for step_name, step in deployment.step_configurations.items() - } - - # Run using ThreadedDagRunner - ThreadedDagRunner( - dag=pipeline_dag, - run_fn=run_step_wrapper, - finalize_fn=finalize_wrapper, - max_parallelism=getattr( - pipeline_settings, "max_parallelism", None - ), - ).run() - logger.info("Pipeline execution completed successfully") except Exception as e: diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 8d4535f1d23..849cf448a00 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -54,6 +54,8 @@ def __init__( stack: "Stack", environment: Dict[str, str], settings: ModalOrchestratorSettings, + shared_image_cache: Optional[Dict[str, Any]] = None, + shared_app: Optional[Any] = None, ): """Initialize the Modal sandbox executor. @@ -62,23 +64,31 @@ def __init__( stack: The ZenML stack. environment: Environment variables. settings: Modal orchestrator settings. + shared_image_cache: Pre-built images shared across step executions. + shared_app: Shared Modal app for the entire pipeline execution. 
""" self.deployment = deployment self.stack = stack self.environment = environment self.settings = settings self.client = Client() + self.shared_image_cache = shared_image_cache or {} - # Create Modal app for this pipeline - pipeline_name = deployment.pipeline_configuration.name.replace( - "_", "-" - ) - self.app_name = f"zenml-pipeline-{pipeline_name}" - self.app = modal.App.lookup( - self.app_name, - create_if_missing=True, - environment_name=settings.modal_environment, - ) + # Use shared app if provided, otherwise create new one + if shared_app: + self.app = shared_app + self.app_name = shared_app.name + else: + # Create Modal app for this pipeline + pipeline_name = deployment.pipeline_configuration.name.replace( + "_", "-" + ) + self.app_name = f"zenml-pipeline-{pipeline_name}" + self.app = modal.App.lookup( + self.app_name, + create_if_missing=True, + environment_name=settings.modal_environment, + ) def _build_entrypoint_command( self, base_command: List[str], args: List[str] @@ -217,15 +227,8 @@ async def _execute_sandbox( region = self.settings.region timeout = self.settings.timeout - # Get or build Modal image - image_name = self._get_image_name(step_name) - zenml_image = get_or_build_modal_image( - image_name=image_name, - stack=self.stack, - pipeline_name=self.deployment.pipeline_configuration.name, - build_id=str(self.deployment.build.id), - app=self.app, - ) + # Get or build Modal image (with shared cache support) + zenml_image = self._get_cached_or_build_image(step_name) # Create environment secret env_secret = self._create_environment_secret() @@ -321,6 +324,64 @@ def _get_image_name(self, step_name: Optional[str] = None) -> str: else: return ModalOrchestrator.get_image(deployment=self.deployment) + def _get_cached_or_build_image( + self, step_name: Optional[str] = None + ) -> Any: + """Get cached Modal image or build new one if not in cache. + + This method first checks the shared image cache for an existing image. 
+ If found, it returns the cached image. Otherwise, it falls back to + the standard image building process. + + Args: + step_name: Name of the step (None for pipeline-level). + + Returns: + Modal image (either cached or newly built). + """ + image_name = self._get_image_name(step_name) + + # Check shared cache first + cache_key = self._get_image_cache_key(image_name, step_name) + if cache_key in self.shared_image_cache: + logger.info( + f"Using cached Modal image for {step_name or 'pipeline'}: {cache_key}" + ) + return self.shared_image_cache[cache_key] + + # Fallback to existing image building logic + logger.info( + f"Building new Modal image for {step_name or 'pipeline'}: {image_name}" + ) + return get_or_build_modal_image( + image_name=image_name, + stack=self.stack, + pipeline_name=self.deployment.pipeline_configuration.name, + build_id=str(self.deployment.build.id), + app=self.app, + ) + + def _get_image_cache_key( + self, image_name: str, step_name: Optional[str] = None + ) -> str: + """Generate a cache key for Modal images. + + Args: + image_name: The base image name. + step_name: Name of the step (None for pipeline-level). + + Returns: + Cache key for the image. 
+ """ + # Use build ID and step name to create unique cache key + # Include a hash of the image name for uniqueness + build_id = str(self.deployment.build.id) + image_hash = str(hash(image_name))[-8:] # Last 8 chars of hash + if step_name: + return f"{build_id}_{step_name}_{image_hash}" + else: + return f"{build_id}_pipeline_{image_hash}" + async def execute_pipeline( self, orchestrator_run_id: str, From be58a00d0372a64c744e09fff47e4aa1a5366299 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sat, 12 Jul 2025 17:40:39 +0200 Subject: [PATCH 51/77] Ensure deployment build exists before image cache operations --- .../modal_orchestrator_entrypoint.py | 6 ++ .../orchestrators/modal_sandbox_executor.py | 78 +++++++++++++------ 2 files changed, 61 insertions(+), 23 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index 572049f7b37..8fc8b6310b1 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -128,6 +128,12 @@ async def prepare_shared_image_cache( # Collect all unique images needed across all steps unique_images: Dict[str, str] = {} # cache_key -> image_name + # Check if deployment has a build + if deployment.build is None: + raise ValueError( + "Deployment build is None, cannot prepare image cache" + ) + # Add pipeline-level image if needed pipeline_image = ModalOrchestrator.get_image(deployment=deployment) build_id = str(deployment.build.id) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 849cf448a00..78de28b0bac 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -128,7 +128,9 @@ def 
_get_step_settings(self, step_name: str) -> ModalOrchestratorSettings: if hasattr(step_settings, "model_dump"): step_settings_data = step_settings.model_dump() else: - step_settings_data = step_settings + step_settings_data = ( + dict(step_settings) if step_settings else {} + ) step_modal_settings = ModalOrchestratorSettings.model_validate( step_settings_data @@ -178,22 +180,32 @@ def _get_resource_config( if step_name: step_settings = self._get_step_settings(step_name) step_config = self.deployment.step_configurations[step_name].config - resource_settings = step_config.settings.get(RESOURCE_SETTINGS_KEY) - gpu_values = get_gpu_values( - step_settings.gpu, resource_settings or ResourceSettings() + resource_settings_raw = step_config.settings.get( + RESOURCE_SETTINGS_KEY ) + + # Ensure we have a ResourceSettings object + if resource_settings_raw is None: + resource_settings = ResourceSettings() + elif isinstance(resource_settings_raw, ResourceSettings): + resource_settings = resource_settings_raw + else: + # Convert to ResourceSettings if it's a different type + resource_settings = ResourceSettings.model_validate( + resource_settings_raw.model_dump() + if hasattr(resource_settings_raw, "model_dump") + else dict(resource_settings_raw) + ) + + gpu_values = get_gpu_values(step_settings.gpu, resource_settings) else: # Pipeline-level resource settings resource_settings = get_resource_settings_from_deployment( self.deployment, RESOURCE_SETTINGS_KEY ) - gpu_values = get_gpu_values( - self.settings.gpu, resource_settings or ResourceSettings() - ) + gpu_values = get_gpu_values(self.settings.gpu, resource_settings) - cpu_count, memory_mb = get_resource_values( - resource_settings or ResourceSettings() - ) + cpu_count, memory_mb = get_resource_values(resource_settings) return gpu_values, cpu_count, memory_mb async def _execute_sandbox( @@ -286,18 +298,23 @@ async def _store_image_id(self, zenml_image: Any) -> None: # After sandbox creation, the image should be hydrated 
zenml_image.hydrate() if hasattr(zenml_image, "object_id") and zenml_image.object_id: - image_name_key = f"zenml_image_{self.deployment.build.id}" - - # Store the image ID in Modal's persistent storage - pipeline_name = self.deployment.pipeline_configuration.name - stored_id = modal.Dict.from_name( - f"zenml-image-cache-{pipeline_name}", - create_if_missing=True, - ) - stored_id[image_name_key] = zenml_image.object_id - logger.info( - f"Stored Modal image ID for build {self.deployment.build.id}" - ) + if self.deployment.build is not None: + image_name_key = f"zenml_image_{self.deployment.build.id}" + + # Store the image ID in Modal's persistent storage + pipeline_name = self.deployment.pipeline_configuration.name + stored_id = modal.Dict.from_name( + f"zenml-image-cache-{pipeline_name}", + create_if_missing=True, + ) + stored_id[image_name_key] = zenml_image.object_id + logger.info( + f"Stored Modal image ID for build {self.deployment.build.id}" + ) + else: + logger.warning( + "Deployment build is None, cannot store image ID" + ) else: logger.warning("Image not hydrated after sandbox creation") except Exception as e: @@ -353,6 +370,9 @@ def _get_cached_or_build_image( logger.info( f"Building new Modal image for {step_name or 'pipeline'}: {image_name}" ) + if self.deployment.build is None: + raise ValueError("Deployment build is None, cannot build image") + return get_or_build_modal_image( image_name=image_name, stack=self.stack, @@ -375,6 +395,11 @@ def _get_image_cache_key( """ # Use build ID and step name to create unique cache key # Include a hash of the image name for uniqueness + if self.deployment.build is None: + raise ValueError( + "Deployment build is None, cannot generate cache key" + ) + build_id = str(self.deployment.build.id) image_hash = str(hash(image_name))[-8:] # Last 8 chars of hash if step_name: @@ -401,11 +426,18 @@ async def execute_pipeline( command = ( ModalOrchestratorEntrypointConfiguration.get_entrypoint_command() ) + from uuid import UUID 
+ + # Convert run_id to UUID if it's a string + run_id_uuid = None + if run_id is not None: + run_id_uuid = UUID(run_id) if isinstance(run_id, str) else run_id + args = ( ModalOrchestratorEntrypointConfiguration.get_entrypoint_arguments( deployment_id=self.deployment.id, orchestrator_run_id=orchestrator_run_id, - run_id=run_id or None, + run_id=run_id_uuid, ) ) entrypoint_command = self._build_entrypoint_command(command, args) From 1b2cfbe1d800b74e5b9ad99e1528c3dee9e38ed3 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sat, 12 Jul 2025 19:26:52 +0200 Subject: [PATCH 52/77] Refactor resource configuration methods for robustness --- .../orchestrators/modal_sandbox_executor.py | 222 +++++++++++++++--- src/zenml/integrations/modal/utils.py | 77 +++++- 2 files changed, 256 insertions(+), 43 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 78de28b0bac..7cb973242ed 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -166,48 +166,200 @@ def _create_environment_secret(self) -> Optional[Any]: } return modal.Secret.from_dict(env_dict) - def _get_resource_config( + def _get_resource_settings( self, step_name: Optional[str] = None - ) -> tuple[Optional[str], Optional[int], Optional[int]]: - """Get resource configuration for pipeline or step. + ) -> ResourceSettings: + """Get resource settings for pipeline or step with robust extraction. Args: step_name: Name of the step (None for pipeline-level). Returns: - Tuple of (gpu_values, cpu_count, memory_mb). + ResourceSettings object, never None. 
""" if step_name: - step_settings = self._get_step_settings(step_name) step_config = self.deployment.step_configurations[step_name].config - resource_settings_raw = step_config.settings.get( - RESOURCE_SETTINGS_KEY - ) - # Ensure we have a ResourceSettings object - if resource_settings_raw is None: - resource_settings = ResourceSettings() - elif isinstance(resource_settings_raw, ResourceSettings): - resource_settings = resource_settings_raw - else: - # Convert to ResourceSettings if it's a different type - resource_settings = ResourceSettings.model_validate( - resource_settings_raw.model_dump() - if hasattr(resource_settings_raw, "model_dump") - else dict(resource_settings_raw) + # Method 1: Direct access to resource_settings (preferred) + resource_settings = step_config.resource_settings + if resource_settings is not None: + logger.debug( + f"Using direct resource settings for step {step_name}" ) + return resource_settings - gpu_values = get_gpu_values(step_settings.gpu, resource_settings) + # Method 2: Look under "resources" key in settings + resource_settings_raw = step_config.settings.get( + RESOURCE_SETTINGS_KEY + ) + if resource_settings_raw is not None: + if isinstance(resource_settings_raw, ResourceSettings): + logger.debug( + f"Using resource settings from settings key for step {step_name}" + ) + return resource_settings_raw + else: + # Try to convert to ResourceSettings + try: + if hasattr(resource_settings_raw, "model_dump"): + resource_settings = ( + ResourceSettings.model_validate( + resource_settings_raw.model_dump() + ) + ) + elif hasattr(resource_settings_raw, "__dict__"): + resource_settings = ( + ResourceSettings.model_validate( + resource_settings_raw.__dict__ + ) + ) + else: + resource_settings = ( + ResourceSettings.model_validate( + dict(resource_settings_raw) + ) + ) + logger.debug( + f"Converted resource settings for step {step_name}" + ) + return resource_settings + except Exception as e: + logger.warning( + f"Failed to convert resource 
settings for step {step_name}: {e}. " + f"Using default ResourceSettings." + ) + + # Method 3: Default empty settings for step + logger.debug( + f"Using default resource settings for step {step_name}" + ) + return ResourceSettings() else: # Pipeline-level resource settings resource_settings = get_resource_settings_from_deployment( self.deployment, RESOURCE_SETTINGS_KEY ) + logger.debug("Using pipeline-level resource settings") + return resource_settings + + def _get_resource_config( + self, step_name: Optional[str] = None + ) -> tuple[Optional[str], Optional[int], Optional[int]]: + """Get validated resource configuration for pipeline or step. + + Args: + step_name: Name of the step (None for pipeline-level). + + Returns: + Tuple of (gpu_values, cpu_count, memory_mb) with validated values. + """ + # Get resource settings using robust extraction + resource_settings = self._get_resource_settings(step_name) + + # Get GPU configuration + if step_name: + step_settings = self._get_step_settings(step_name) + gpu_values = get_gpu_values(step_settings.gpu, resource_settings) + else: gpu_values = get_gpu_values(self.settings.gpu, resource_settings) + # Get CPU and memory with validation cpu_count, memory_mb = get_resource_values(resource_settings) + + # Log resource configuration for debugging + logger.debug( + f"Resource config for {step_name or 'pipeline'}: " + f"GPU={gpu_values}, CPU={cpu_count}, Memory={memory_mb}MB" + ) + return gpu_values, cpu_count, memory_mb + def _prepare_modal_api_params( + self, + entrypoint_command: List[str], + image: Any, + gpu: Optional[str], + cpu: Optional[int], + memory: Optional[int], + cloud: Optional[str], + region: Optional[str], + app: Any, + timeout: int, + secrets: List[Any], + ) -> Dict[str, Any]: + """Prepare and validate Modal API parameters. + + This method ensures that all parameters passed to Modal API are valid + and handles None values appropriately. + + Args: + entrypoint_command: Command to execute. + image: Modal image. 
+ gpu: GPU configuration string. + cpu: CPU count. + memory: Memory in MB. + cloud: Cloud provider. + region: Cloud region. + app: Modal app. + timeout: Timeout in seconds. + secrets: List of Modal secrets. + + Returns: + Dictionary of validated parameters for Modal API. + + Raises: + ValueError: If required parameters are invalid. + """ + if not entrypoint_command: + raise ValueError("Entrypoint command cannot be empty") + + if image is None: + raise ValueError("Modal image is required") + + if timeout <= 0: + raise ValueError(f"Timeout must be positive, got {timeout}") + + # Build parameters dictionary + # Note: entrypoint_command will be passed as *args separately + params = { + "image": image, + "app": app, + "timeout": timeout, + } + + # Add optional parameters only if they have valid values + if gpu is not None: + # Validate GPU format + if isinstance(gpu, str) and gpu.strip(): + params["gpu"] = gpu + else: + logger.warning(f"Invalid GPU value '{gpu}', ignoring") + + if cpu is not None and cpu > 0: + params["cpu"] = cpu + + if memory is not None and memory > 0: + params["memory"] = memory + + if cloud is not None and cloud.strip(): + params["cloud"] = cloud + + if region is not None and region.strip(): + params["region"] = region + + if secrets: + params["secrets"] = secrets + + # Log final parameters for debugging + param_summary = { + k: v + for k, v in params.items() + if k not in ["image", "app", "secrets"] # Skip complex objects + } + logger.debug(f"Modal sandbox parameters: {param_summary}") + + return params + async def _execute_sandbox( self, entrypoint_command: List[str], @@ -225,7 +377,7 @@ async def _execute_sandbox( run_id: Pipeline run ID for tagging. synchronous: Whether to wait for completion. 
""" - # Get resource configuration + # Get resource configuration with validation gpu_values, cpu_count, memory_mb = self._get_resource_config(step_name) # Get settings (step-specific for steps, pipeline-level for pipeline) @@ -258,19 +410,25 @@ async def _execute_sandbox( logger.info(f"Creating sandbox for {execution_mode.lower()} execution") logger.info(f"Sandbox tags: {tags}") + # Validate and prepare Modal API parameters + modal_params = self._prepare_modal_api_params( + entrypoint_command=entrypoint_command, + image=zenml_image, + gpu=gpu_values, + cpu=cpu_count, + memory=memory_mb, + cloud=cloud, + region=region, + app=self.app, + timeout=timeout, + secrets=secrets, + ) + with modal.enable_output(): - # Create sandbox with environment variables passed as secrets + # Create sandbox with validated parameters + # Pass entrypoint command as positional args and others as kwargs sb = await modal.Sandbox.create.aio( - *entrypoint_command, - image=zenml_image, - gpu=gpu_values, - cpu=cpu_count, - memory=memory_mb, - cloud=cloud, - region=region, - app=self.app, - timeout=timeout, - secrets=secrets, + *entrypoint_command, **modal_params ) # Set tags diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index d6f9b7d06eb..7b9f872d158 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -151,43 +151,98 @@ def get_gpu_values( ) -> Optional[str]: """Get the GPU values for Modal components. + This function unifies GPU configuration from both Modal orchestrator settings + and ResourceSettings. It prioritizes explicit GPU type from Modal settings, + but falls back to ResourceSettings for GPU count and type. + Args: - gpu_type: The GPU type (e.g., "T4", "A100"). - resource_settings: The resource settings. + gpu_type: The GPU type from Modal settings (e.g., "T4", "A100"). + resource_settings: The resource settings containing GPU configuration. 
Returns: - The GPU string if a count is specified, otherwise the GPU type. + The GPU string for Modal API, or None if no GPU requested. + Format: "GPU_TYPE" or "GPU_TYPE:COUNT" """ + # Check if GPU is requested via ResourceSettings + gpu_count = resource_settings.gpu_count + + # If no GPU type specified but GPU count > 0, try to infer from ResourceSettings + if not gpu_type and gpu_count and gpu_count > 0: + # Check if ResourceSettings has gpu_type (some versions might support this) + if ( + hasattr(resource_settings, "gpu_type") + and resource_settings.gpu_type + ): + gpu_type = resource_settings.gpu_type + else: + # Default to a reasonable GPU type if count is specified + logger.warning( + f"GPU count ({gpu_count}) specified but no GPU type provided. " + "Defaulting to 'T4'. Consider specifying gpu_type in Modal orchestrator settings." + ) + gpu_type = "T4" + + # No GPU requested if not gpu_type: return None - gpu_count = resource_settings.gpu_count + + # GPU type specified but no count, default to 1 if gpu_count is None or gpu_count == 0: return gpu_type + + # Both type and count specified return f"{gpu_type}:{gpu_count}" def get_resource_values( resource_settings: ResourceSettings, ) -> Tuple[Optional[int], Optional[int]]: - """Get CPU and memory values from resource settings. + """Get CPU and memory values from resource settings with validation. Args: resource_settings: The resource settings. Returns: - Tuple of (cpu_count, memory_mb). + Tuple of (cpu_count, memory_mb) with validated values. """ - # Get CPU count + # Get CPU count with validation cpu_count: Optional[int] = None if resource_settings.cpu_count is not None: cpu_count = int(resource_settings.cpu_count) + # Validate CPU count is reasonable + if cpu_count <= 0: + logger.warning(f"Invalid CPU count {cpu_count}, ignoring.") + cpu_count = None + elif cpu_count > 96: # Modal's typical max + logger.warning( + f"CPU count {cpu_count} is very high. " + "Consider if this is intentional." 
+ ) - # Convert memory to MB if needed + # Convert memory to MB if needed with validation memory_mb: Optional[int] = None if resource_settings.memory: - memory_value = resource_settings.get_memory(ByteUnit.MB) - if memory_value is not None: - memory_mb = int(memory_value) + try: + memory_value = resource_settings.get_memory(ByteUnit.MB) + if memory_value is not None: + memory_mb = int(memory_value) + # Validate memory is reasonable + if memory_mb <= 0: + logger.warning(f"Invalid memory {memory_mb}MB, ignoring.") + memory_mb = None + elif memory_mb < 128: # Less than 128MB seems too low + logger.warning( + f"Memory {memory_mb}MB is very low. " + "Consider if this is intentional." + ) + elif memory_mb > 1024 * 1024: # More than 1TB seems high + logger.warning( + f"Memory {memory_mb}MB is very high. " + "Consider if this is intentional." + ) + except Exception as e: + logger.warning(f"Failed to parse memory setting: {e}") + memory_mb = None return cpu_count, memory_mb From c739f88aaaa4644dc4f8eb630f8d4e6ee504cc2d Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sat, 12 Jul 2025 22:44:48 +0200 Subject: [PATCH 53/77] Refactor resource settings conversion and GPU type handling --- .../orchestrators/modal_sandbox_executor.py | 46 ++++++++++++------- src/zenml/integrations/modal/utils.py | 18 +------- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 7cb973242ed..83dfe1427d1 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -201,31 +201,45 @@ def _get_resource_settings( else: # Try to convert to ResourceSettings try: + # Handle different types of settings objects if hasattr(resource_settings_raw, "model_dump"): - resource_settings = ( - ResourceSettings.model_validate( - resource_settings_raw.model_dump() - 
) - ) + # Pydantic model - convert via model_dump + settings_dict = resource_settings_raw.model_dump() elif hasattr(resource_settings_raw, "__dict__"): - resource_settings = ( - ResourceSettings.model_validate( - resource_settings_raw.__dict__ - ) - ) + # Object with attributes - convert via __dict__ + settings_dict = resource_settings_raw.__dict__ + elif hasattr( + resource_settings_raw, "__iter__" + ) and not isinstance(resource_settings_raw, str): + # Dict-like object - convert to dict + settings_dict = dict(resource_settings_raw) else: - resource_settings = ( - ResourceSettings.model_validate( - dict(resource_settings_raw) - ) - ) + # Fallback - try direct conversion + settings_dict = dict(resource_settings_raw) + + # Filter out None values and non-resource fields + filtered_dict = {} + valid_fields = { + "cpu_count", + "memory", + "gpu_count", + "gpu_type", + } + for key, value in settings_dict.items(): + if key in valid_fields and value is not None: + filtered_dict[key] = value + + resource_settings = ResourceSettings.model_validate( + filtered_dict + ) logger.debug( - f"Converted resource settings for step {step_name}" + f"Converted resource settings for step {step_name}: {filtered_dict}" ) return resource_settings except Exception as e: logger.warning( f"Failed to convert resource settings for step {step_name}: {e}. " + f"Type: {type(resource_settings_raw)}. " f"Using default ResourceSettings." 
) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 7b9f872d158..d0ae607fc78 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -166,23 +166,7 @@ def get_gpu_values( # Check if GPU is requested via ResourceSettings gpu_count = resource_settings.gpu_count - # If no GPU type specified but GPU count > 0, try to infer from ResourceSettings - if not gpu_type and gpu_count and gpu_count > 0: - # Check if ResourceSettings has gpu_type (some versions might support this) - if ( - hasattr(resource_settings, "gpu_type") - and resource_settings.gpu_type - ): - gpu_type = resource_settings.gpu_type - else: - # Default to a reasonable GPU type if count is specified - logger.warning( - f"GPU count ({gpu_count}) specified but no GPU type provided. " - "Defaulting to 'T4'. Consider specifying gpu_type in Modal orchestrator settings." - ) - gpu_type = "T4" - - # No GPU requested + # No GPU requested if no type specified if not gpu_type: return None From 5d7a44d6f6d0c0ac2933eb7e00d0ad3543d7bb65 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sat, 12 Jul 2025 23:02:51 +0200 Subject: [PATCH 54/77] Refactor resource settings extraction for clarity --- .../orchestrators/modal_sandbox_executor.py | 161 ++++++++++-------- 1 file changed, 86 insertions(+), 75 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 83dfe1427d1..46bb83ef54a 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -106,6 +106,64 @@ def _build_entrypoint_command( """ return base_command + args + # --------------------------------------------------------------------- + # Resource utilities + # --------------------------------------------------------------------- + + @staticmethod + def 
_to_resource_settings(data: Any | None) -> ResourceSettings: + """Convert arbitrary input to a ``ResourceSettings`` object. + + This helper makes sure that we *always* return a properly validated + ``ResourceSettings`` instance. It gracefully handles different shapes + that may appear in historical deployments (actual instance, pydantic + model, plain dict, or even a generic object with ``__dict__``). + + Args: + data: Raw resource settings information. + + Returns: + A validated ``ResourceSettings`` instance (empty when no data). + """ + # Already a ResourceSettings – just return + if isinstance(data, ResourceSettings): + return data + + # Nothing configured – return an empty instance + if data is None: + return ResourceSettings() + + # Convert pydantic/BaseSettings models to dict first + if hasattr(data, "model_dump"): + try: + data = data.model_dump() + except Exception: + # Fallback to __dict__ if model_dump fails for some reason + data = getattr(data, "__dict__", {}) + + # Convert mapping-like objects to dict + if not isinstance(data, dict): + try: + data = dict(data) # type: ignore[arg-type] + except Exception: + # If conversion fails, return empty settings instead of error + logger.warning( + "Unable to interpret resource settings of type %s – falling back to default.", + type(data), + ) + return ResourceSettings() + + # Finally validate + try: + return ResourceSettings.model_validate(data) # type: ignore[arg-type] + except Exception as e: + logger.warning( + "Failed to validate resource settings %s – %s. Using default.", + data, + e, + ) + return ResourceSettings() + def _get_step_settings(self, step_name: str) -> ModalOrchestratorSettings: """Get merged settings for a specific step. @@ -169,92 +227,45 @@ def _create_environment_secret(self) -> Optional[Any]: def _get_resource_settings( self, step_name: Optional[str] = None ) -> ResourceSettings: - """Get resource settings for pipeline or step with robust extraction. 
- - Args: - step_name: Name of the step (None for pipeline-level). + """Return validated resource settings for a pipeline or a step. - Returns: - ResourceSettings object, never None. + The previous implementation tried multiple ad-hoc extraction methods. + We now delegate the heavy lifting to ``_to_resource_settings`` which + guarantees a valid object and dramatically reduces the branching. """ if step_name: - step_config = self.deployment.step_configurations[step_name].config + step_cfg = self.deployment.step_configurations[step_name].config - # Method 1: Direct access to resource_settings (preferred) - resource_settings = step_config.resource_settings - if resource_settings is not None: + # 1) direct attribute + res = self._to_resource_settings(step_cfg.resource_settings) + if not res.empty: logger.debug( - f"Using direct resource settings for step {step_name}" + "Using direct resource settings for step %s", step_name ) - return resource_settings + return res - # Method 2: Look under "resources" key in settings - resource_settings_raw = step_config.settings.get( - RESOURCE_SETTINGS_KEY + # 2) settings["resources"] fallback + res = self._to_resource_settings( + step_cfg.settings.get(RESOURCE_SETTINGS_KEY) ) - if resource_settings_raw is not None: - if isinstance(resource_settings_raw, ResourceSettings): - logger.debug( - f"Using resource settings from settings key for step {step_name}" - ) - return resource_settings_raw - else: - # Try to convert to ResourceSettings - try: - # Handle different types of settings objects - if hasattr(resource_settings_raw, "model_dump"): - # Pydantic model - convert via model_dump - settings_dict = resource_settings_raw.model_dump() - elif hasattr(resource_settings_raw, "__dict__"): - # Object with attributes - convert via __dict__ - settings_dict = resource_settings_raw.__dict__ - elif hasattr( - resource_settings_raw, "__iter__" - ) and not isinstance(resource_settings_raw, str): - # Dict-like object - convert to dict - 
settings_dict = dict(resource_settings_raw) - else: - # Fallback - try direct conversion - settings_dict = dict(resource_settings_raw) - - # Filter out None values and non-resource fields - filtered_dict = {} - valid_fields = { - "cpu_count", - "memory", - "gpu_count", - "gpu_type", - } - for key, value in settings_dict.items(): - if key in valid_fields and value is not None: - filtered_dict[key] = value - - resource_settings = ResourceSettings.model_validate( - filtered_dict - ) - logger.debug( - f"Converted resource settings for step {step_name}: {filtered_dict}" - ) - return resource_settings - except Exception as e: - logger.warning( - f"Failed to convert resource settings for step {step_name}: {e}. " - f"Type: {type(resource_settings_raw)}. " - f"Using default ResourceSettings." - ) - - # Method 3: Default empty settings for step + if not res.empty: + logger.debug( + "Using settings-key resource settings for step %s", + step_name, + ) + return res + logger.debug( - f"Using default resource settings for step {step_name}" + "No resource settings for step %s – defaulting", step_name ) return ResourceSettings() - else: - # Pipeline-level resource settings - resource_settings = get_resource_settings_from_deployment( - self.deployment, RESOURCE_SETTINGS_KEY - ) - logger.debug("Using pipeline-level resource settings") - return resource_settings + + # Pipeline-level: delegate to existing util (already returns RS) + resource_settings = get_resource_settings_from_deployment( + self.deployment, RESOURCE_SETTINGS_KEY + ) + logger.debug("Using pipeline-level resource settings") + return resource_settings def _get_resource_config( self, step_name: Optional[str] = None From 7685c4694fc05bcf08baafcc1b5c48bad36a72d7 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sat, 12 Jul 2025 23:07:29 +0200 Subject: [PATCH 55/77] Refactor GPU configuration handling, handle missing type --- .../orchestrators/modal_sandbox_executor.py | 24 +++++++++++++++---- 1 file changed, 19 
insertions(+), 5 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 46bb83ef54a..4872f879416 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -144,7 +144,7 @@ def _to_resource_settings(data: Any | None) -> ResourceSettings: # Convert mapping-like objects to dict if not isinstance(data, dict): try: - data = dict(data) # type: ignore[arg-type] + data = dict(data) except Exception: # If conversion fails, return empty settings instead of error logger.warning( @@ -155,7 +155,7 @@ def _to_resource_settings(data: Any | None) -> ResourceSettings: # Finally validate try: - return ResourceSettings.model_validate(data) # type: ignore[arg-type] + return ResourceSettings.model_validate(data) except Exception as e: logger.warning( "Failed to validate resource settings %s – %s. Using default.", @@ -281,13 +281,27 @@ def _get_resource_config( # Get resource settings using robust extraction resource_settings = self._get_resource_settings(step_name) - # Get GPU configuration + # Get GPU configuration (with default type if unspecified but gpu_count > 0) + gpu_type: Optional[str] = None if step_name: step_settings = self._get_step_settings(step_name) - gpu_values = get_gpu_values(step_settings.gpu, resource_settings) + gpu_type = step_settings.gpu else: - gpu_values = get_gpu_values(self.settings.gpu, resource_settings) + gpu_type = self.settings.gpu + + # If gpu_type is missing but gpu_count > 0, default to T4 + if ( + gpu_type is None + and resource_settings.gpu_count + and resource_settings.gpu_count > 0 + ): + gpu_type = "T4" + logger.debug( + f"No GPU type specified for {'step ' + step_name if step_name else 'pipeline'}, " + f"but gpu_count={resource_settings.gpu_count}. Defaulting to {gpu_type}." 
+ ) + gpu_values = get_gpu_values(gpu_type, resource_settings) # Get CPU and memory with validation cpu_count, memory_mb = get_resource_values(resource_settings) From d9fbeb223382ab03ded7882e29e6ae252fc5e82f Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sat, 12 Jul 2025 23:26:35 +0200 Subject: [PATCH 56/77] Use sane defaults for pipeline resource settings --- src/zenml/integrations/modal/utils.py | 49 ++++++++------------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index d0ae607fc78..e7bce3f2f3e 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -489,39 +489,18 @@ def get_resource_settings_from_deployment( pipeline_resource_dict ) else: - # Fallback to highest resource requirements across all steps - if deployment.step_configurations: - # Find step with highest resource requirements for modal execution - max_cpu = 0 - max_memory = 0 - max_gpu = 0 - best_step_resources = ResourceSettings() - - for step_config in deployment.step_configurations.values(): - step_resources = step_config.config.resource_settings - step_cpu = step_resources.cpu_count or 0 - step_memory = step_resources.get_memory() or 0 - step_gpu = step_resources.gpu_count or 0 - - # Calculate resource "score" to find most demanding step - resource_score = ( - step_cpu + (step_memory / 1024) + (step_gpu * 10) - ) - best_score = max_cpu + (max_memory / 1024) + (max_gpu * 10) - - if resource_score > best_score: - max_cpu = step_cpu - max_memory = step_memory - max_gpu = step_gpu - best_step_resources = step_resources - - logger.info( - f"No pipeline-level resource settings found. 
Using highest resource " - f"requirements from steps: {max_cpu} CPUs, {max_memory / 1024:.1f}GB memory, " - f"{max_gpu} GPUs" - ) - resource_settings = best_step_resources - else: - resource_settings = ResourceSettings() # Default empty settings - + # No explicit pipeline resources: use sane defaults (ignore step-level) + # As per user request: for pipeline mode, do not fallback to max(step resources) + resource_settings = ResourceSettings( + cpu_count=1, + memory="1024MB", + gpu_count=0, + ) + logger.info( + "No explicit pipeline-level resource settings found. " + "Using sane defaults: %s CPU, %s memory, %s GPU", + resource_settings.cpu_count, + resource_settings.memory, + resource_settings.gpu_count, + ) return resource_settings From 4fb95b286261c0bf7dadd822e0f3c5b67d683b04 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sat, 12 Jul 2025 23:29:07 +0200 Subject: [PATCH 57/77] Refactor function docstrings for better clarity --- .../modal_orchestrator_entrypoint.py | 17 +++++++++++-- .../orchestrators/modal_sandbox_executor.py | 25 ++++++++++++++++--- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index 8fc8b6310b1..67b33008121 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -106,6 +106,10 @@ async def prepare_shared_image_cache( Returns: Tuple of (shared_image_cache, shared_modal_app). + + Raises: + ValueError: If the deployment has no associated build information. + Exception: For any unexpected error while building images. 
""" from zenml.integrations.modal.orchestrators.modal_orchestrator import ( ModalOrchestrator, @@ -226,11 +230,20 @@ def execute_per_step_mode( ) def run_step_wrapper(step_name: str) -> None: - """Wrapper to execute step with shared resources.""" + """Wrapper to execute a single pipeline step. + + Args: + step_name: Name of the step to execute. + """ run_step_on_modal(step_name, shared_executor) def finalize_wrapper(node_states: Dict[str, NodeStatus]) -> None: - """Wrapper to finalize pipeline execution.""" + """Wrapper to finalize pipeline execution. + + Args: + node_states: Mapping of node/step names to their execution + status after DAG completion. + """ finalize_run(node_states, args) # Build DAG from deployment diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 4872f879416..91dc3211aa0 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -227,11 +227,20 @@ def _create_environment_secret(self) -> Optional[Any]: def _get_resource_settings( self, step_name: Optional[str] = None ) -> ResourceSettings: - """Return validated resource settings for a pipeline or a step. + """Return validated resource settings for either pipeline or step. - The previous implementation tried multiple ad-hoc extraction methods. - We now delegate the heavy lifting to ``_to_resource_settings`` which - guarantees a valid object and dramatically reduces the branching. + The helper always returns a *proper* :class:`~zenml.config.resource_settings.ResourceSettings` + instance. For a step it checks the step-level settings first and + falls back to an empty configuration; for the pipeline it delegates + to :func:`zenml.integrations.modal.utils.get_resource_settings_from_deployment`. + + Args: + step_name: Optional name of the step for which to fetch resource + settings. 
If ``None`` (default), pipeline-level settings are + returned. + + Returns: + A validated ``ResourceSettings`` object (never ``None``). """ if step_name: step_cfg = self.deployment.step_configurations[step_name].config @@ -552,6 +561,10 @@ def _get_cached_or_build_image( Returns: Modal image (either cached or newly built). + + Raises: + ValueError: If the deployment does not have an associated build + (required to identify the Docker image). """ image_name = self._get_image_name(step_name) @@ -589,6 +602,10 @@ def _get_image_cache_key( Returns: Cache key for the image. + + Raises: + ValueError: If the deployment does not have a build ID which is + required to scope the cache key. """ # Use build ID and step name to create unique cache key # Include a hash of the image name for uniqueness From c410179093700a81ef5e6babf8ec206582ffde33 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 13 Jul 2025 17:22:41 +0200 Subject: [PATCH 58/77] Refactor Modal image building logic for clarity --- src/zenml/integrations/modal/utils.py | 155 ++++++++++++-------------- 1 file changed, 73 insertions(+), 82 deletions(-) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index e7bce3f2f3e..a8044b81d3a 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -16,10 +16,7 @@ import os from typing import Any, Dict, List, Optional, Tuple, Union -try: - import modal -except ImportError: - modal = None # type: ignore +import modal from zenml.config import ResourceSettings from zenml.config.resource_settings import ByteUnit @@ -170,8 +167,11 @@ def get_gpu_values( if not gpu_type: return None + # No GPU requested if count is explicitly 0 + if gpu_count == 0: + return None # GPU type specified but no count, default to 1 - if gpu_count is None or gpu_count == 0: + if gpu_count is None: return gpu_type # Both type and count specified @@ -231,21 +231,21 @@ def get_resource_values( return cpu_count, memory_mb -def 
get_or_build_modal_image( +def _build_modal_image_from_registry( image_name: str, stack: "Stack", - pipeline_name: str, - build_id: str, - app: Any, + environment: Optional[Dict[str, str]] = None, ) -> Any: - """Get existing Modal image or build new one based on pipeline name and build ID. + """Build a Modal image from a Docker registry with authentication. + + This helper function centralizes the shared logic for building Modal images + from Docker registries, including credential validation, secret creation, + and image building with Modal installation. Args: image_name: The name of the Docker image to use as base. stack: The ZenML stack containing container registry. - pipeline_name: The pipeline name for caching. - build_id: The build ID for the image key. - app: The Modal app to store/retrieve images. + environment: Optional environment variables to apply to the image. Returns: The configured Modal image. @@ -261,31 +261,6 @@ def get_or_build_modal_image( "it is correctly configured." 
) - # Try to get existing image from the app - image_name_key = f"zenml_image_{build_id}" - - try: - # Try to look up existing image by ID from Modal's object store - # We'll store the image ID as a Modal object for persistence - try: - # Try to get stored image ID - stored_id = modal.Dict.from_name( - f"zenml-image-cache-{pipeline_name}" - ) - if image_name_key in stored_id: - image_id = stored_id[image_name_key] - existing_image = modal.Image.from_id(image_id) - logger.info( - f"Using cached Modal image for build {build_id} in pipeline {pipeline_name}" - ) - return existing_image - except Exception: - # Dict doesn't exist or image not found, will build new one - pass - except Exception: - # If lookup fails, we'll build a new image - pass - logger.info("Building new Modal image") logger.info(f"Base image: {image_name}") @@ -304,11 +279,8 @@ def get_or_build_modal_image( } ) - # Build new Modal image and register it with consistent name - logger.info(f"🔨 Building Modal image from base: {image_name}") - # Build Modal image from the ZenML-built image - # Modal will automatically cache layers and reuse when possible + logger.info(f"🔨 Building Modal image from base: {image_name}") logger.info(f"Creating Modal image from base: {image_name}") zenml_image = ( modal.Image.from_registry( @@ -316,23 +288,28 @@ def get_or_build_modal_image( ).pip_install("modal") # Install Modal in the container ) - # Store the image in the app for future use - setattr(app, image_name_key, zenml_image) + # Apply environment variables if provided + if environment: + zenml_image = zenml_image.env(environment) return zenml_image -def build_modal_image( +def get_or_build_modal_image( image_name: str, stack: "Stack", - environment: Optional[Dict[str, str]] = None, + pipeline_name: str, + build_id: str, + app: Any, ) -> Any: - """Build a Modal image from a ZenML-built Docker image. + """Get existing Modal image or build new one based on pipeline name and build ID. 
Args: image_name: The name of the Docker image to use as base. stack: The ZenML stack containing container registry. - environment: The environment variables to pass to the image. + pipeline_name: The pipeline name for caching. + build_id: The build ID for the image key. + app: The Modal app to store/retrieve images. Returns: The configured Modal image. @@ -341,47 +318,61 @@ def build_modal_image( RuntimeError: If no Docker credentials are found. ValueError: If no container registry is found. """ - if not stack.container_registry: - raise ValueError( - "No Container registry found in the stack. " - "Please add a container registry and ensure " - "it is correctly configured." - ) - - logger.info("Building Modal image") - logger.info(f"Base image: {image_name}") + # Try to get existing image from the app + image_name_key = f"zenml_image_{build_id}" - if docker_creds := stack.container_registry.credentials: - docker_username, docker_password = docker_creds - else: - raise RuntimeError( - "No Docker credentials found for the container registry." 
- ) + try: + # Try to get stored image ID + stored_id = modal.Dict.from_name(f"zenml-image-cache-{pipeline_name}") + if image_name_key in stored_id: + image_id = stored_id[image_name_key] + existing_image = modal.Image.from_id(image_id) + logger.info( + f"Using cached Modal image for build {build_id} in pipeline {pipeline_name}" + ) + return existing_image + except (modal.exceptions.NotFoundError, KeyError): + # Dict doesn't exist or image not found, will build new one + pass - # Create Modal secret for registry authentication - registry_secret = modal.Secret.from_dict( - { - "REGISTRY_USERNAME": docker_username, - "REGISTRY_PASSWORD": docker_password, - } + # Build new image using shared helper + zenml_image = _build_modal_image_from_registry( + image_name=image_name, + stack=stack, + environment=None, # No environment variables for cached images ) - # Build new Modal image and register it with consistent name - logger.info(f"🔨 Building Modal image from base: {image_name}") + # Store the image in the app for future use + setattr(app, image_name_key, zenml_image) - # Build Modal image from the ZenML-built image - # Modal will automatically cache layers and reuse when possible - logger.info(f"Creating Modal image from base: {image_name}") - zenml_image = ( - modal.Image.from_registry( - image_name, secret=registry_secret - ).pip_install("modal") # Install Modal in the container - ) + return zenml_image - if environment: - zenml_image = zenml_image.env(environment) - return zenml_image +def build_modal_image( + image_name: str, + stack: "Stack", + environment: Optional[Dict[str, str]] = None, +) -> Any: + """Build a Modal image from a ZenML-built Docker image. + + Args: + image_name: The name of the Docker image to use as base. + stack: The ZenML stack containing container registry. + environment: The environment variables to pass to the image. + + Returns: + The configured Modal image. + + Raises: + RuntimeError: If no Docker credentials are found. 
+ ValueError: If no container registry is found. + """ + # Use shared helper for image building + return _build_modal_image_from_registry( + image_name=image_name, + stack=stack, + environment=environment, + ) def generate_sandbox_tags( From 6387d42a491e62c24e745e49aa4456b8a2796a4d Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 13 Jul 2025 21:24:48 +0200 Subject: [PATCH 59/77] Update README.md links for LLM-Complete Guide project --- docs/book/user-guide/README.md | 2 +- src/zenml/integrations/modal/utils.py | 11 ++--------- .../modal/step_operators/test_modal_step_operator.py | 11 +++++++++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/book/user-guide/README.md b/docs/book/user-guide/README.md index 945a7db2920..e666dc4f401 100644 --- a/docs/book/user-guide/README.md +++ b/docs/book/user-guide/README.md @@ -17,7 +17,7 @@ Step-by-step instructions to help you master ZenML concepts and features. Complete end-to-end implementations that showcase ZenML in real-world scenarios.\ [See all projects in our website →](https://www.zenml.io/projects) -
ZenCoderYour Own MLOps Engineerzencoder.jpghttps://www.zenml.io/projects/zencoder-your-own-mlops-engineer
LLM-Complete GuideProduction-ready RAG pipelines from basic retrieval to advanced LLMOps with embeddings finetuning and evals.llm-complete-guide.jpghttps://www.zenml.io/projects/llm-complete-guide
NightWatchAI Database Summaries While You Sleepnightwatch.jpghttps://www.zenml.io/projects/nightwatch-ai-database-summaries-while-you-sleep
Research RadarAutomates research paper discovery and classification for specialized research domains.researchradar.jpg
Magic PhotoboothA personalized AI image generation product that can create your avatars from a selfie.magicphoto.jpghttps://www.zenml.io/projects/magic-photobooth
Sign Language Detection with YOLOv5End-to-end computer vision pipelineyolo.jpghttps://www.zenml.io/projects/sign-language-detection-with-yolov5
ZenML Support AgentA production-ready agent that can help you with your ZenML questions.support.jpghttps://www.zenml.io/projects/zenml-support-agent
GameSenseThe LLM That Understands Gamersgamesense.jpghttps://www.zenml.io/projects/gamesense-the-llm-that-understands-gamers
EuroRate PredictorTurn European Central Bank data into actionable interest rate forecasts with this comprehensive MLOps solution.eurorate.jpghttps://www.zenml.io/projects/eurorate-predictor
+
ZenCoderYour Own MLOps Engineerzencoder.jpghttps://www.zenml.io/projects/zencoder-your-own-mlops-engineer
LLM-Complete GuideProduction-ready RAG pipelines from basic retrieval to advanced LLMOps with embeddings finetuning and evals.llm-complete-guide.jpghttps://github.com/zenml-io/zenml-projects/tree/main/llm-complete-guide
NightWatchAI Database Summaries While You Sleepnightwatch.jpghttps://www.zenml.io/projects/nightwatch-ai-database-summaries-while-you-sleep
Research RadarAutomates research paper discovery and classification for specialized research domains.researchradar.jpg
Magic PhotoboothA personalized AI image generation product that can create your avatars from a selfie.magicphoto.jpghttps://www.zenml.io/projects/magic-photobooth
Sign Language Detection with YOLOv5End-to-end computer vision pipelineyolo.jpghttps://www.zenml.io/projects/sign-language-detection-with-yolov5
ZenML Support AgentA production-ready agent that can help you with your ZenML questions.support.jpghttps://www.zenml.io/projects/zenml-support-agent
GameSenseThe LLM That Understands Gamersgamesense.jpghttps://www.zenml.io/projects/gamesense-the-llm-that-understands-gamers
EuroRate PredictorTurn European Central Bank data into actionable interest rate forecasts with this comprehensive MLOps solution.eurorate.jpghttps://www.zenml.io/projects/eurorate-predictor
## Examples diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index a8044b81d3a..43824319cfa 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -170,7 +170,8 @@ def get_gpu_values( # No GPU requested if count is explicitly 0 if gpu_count == 0: return None - # GPU type specified but no count, default to 1 + + # GPU type specified but no count, return just the type if gpu_count is None: return gpu_type @@ -313,10 +314,6 @@ def get_or_build_modal_image( Returns: The configured Modal image. - - Raises: - RuntimeError: If no Docker credentials are found. - ValueError: If no container registry is found. """ # Try to get existing image from the app image_name_key = f"zenml_image_{build_id}" @@ -362,10 +359,6 @@ def build_modal_image( Returns: The configured Modal image. - - Raises: - RuntimeError: If no Docker credentials are found. - ValueError: If no container registry is found. """ # Use shared helper for image building return _build_modal_image_from_registry( diff --git a/tests/integration/integrations/modal/step_operators/test_modal_step_operator.py b/tests/integration/integrations/modal/step_operators/test_modal_step_operator.py index b418f72f6d7..a0325994f68 100644 --- a/tests/integration/integrations/modal/step_operators/test_modal_step_operator.py +++ b/tests/integration/integrations/modal/step_operators/test_modal_step_operator.py @@ -12,6 +12,12 @@ # or implied. See the License for the specific language governing # permissions and limitations under the License. +"""Integration tests for the Modal step operator utilities. + +This module verifies helper functions inside the Modal step operator flavor, +specifically the ``get_gpu_values`` helper which converts GPU type/count pairs +into the string format expected by the Modal SDK. 
+""" import pytest @@ -30,16 +36,17 @@ ("", 1, None), (None, 1, None), ("A100", None, "A100"), - ("A100", 0, "A100"), + ("A100", 0, None), ("A100", 1, "A100:1"), ("A100", 2, "A100:2"), ("V100", None, "V100"), - ("V100", 0, "V100"), + ("V100", 0, None), ("V100", 1, "V100:1"), ("V100", 2, "V100:2"), ], ) def test_get_gpu_values(gpu, gpu_count, expected_result): + """Test the get_gpu_values function.""" settings = ModalStepOperatorSettings(gpu=gpu) resource_settings = ResourceSettings(gpu_count=gpu_count) result = get_gpu_values(settings.gpu, resource_settings) From 083320f6bf182f7c29e75ccf61d88b9d837b2a07 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 13 Jul 2025 21:24:56 +0200 Subject: [PATCH 60/77] Add Modal orchestrator integration tests --- .../modal/orchestrators/__init__.py | 14 + .../test_modal_sandbox_executor.py | 243 ++++++++++++++++++ 2 files changed, 257 insertions(+) create mode 100644 tests/integration/integrations/modal/orchestrators/__init__.py create mode 100644 tests/integration/integrations/modal/orchestrators/test_modal_sandbox_executor.py diff --git a/tests/integration/integrations/modal/orchestrators/__init__.py b/tests/integration/integrations/modal/orchestrators/__init__.py new file mode 100644 index 00000000000..47d03e07fbd --- /dev/null +++ b/tests/integration/integrations/modal/orchestrators/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. 
+"""Modal orchestrator integration tests.""" \ No newline at end of file diff --git a/tests/integration/integrations/modal/orchestrators/test_modal_sandbox_executor.py b/tests/integration/integrations/modal/orchestrators/test_modal_sandbox_executor.py new file mode 100644 index 00000000000..44f72116481 --- /dev/null +++ b/tests/integration/integrations/modal/orchestrators/test_modal_sandbox_executor.py @@ -0,0 +1,243 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. 
+"""Tests for ModalSandboxExecutor pipeline and step resource merging.""" + +from unittest.mock import Mock, patch +from uuid import uuid4 + +from zenml.config.resource_settings import ResourceSettings +from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalOrchestratorSettings, +) + + +class TestModalSandboxExecutorResourceMerging: + """Test resource merging between pipeline and step settings.""" + + def setup_method(self): + """Set up test fixtures.""" + # Mock Modal library + self.modal_patcher = patch( + "zenml.integrations.modal.orchestrators.modal_sandbox_executor.modal" + ) + self.modal_patcher.start() + + # Mock utils functions + self.utils_patcher = patch( + "zenml.integrations.modal.orchestrators.modal_sandbox_executor.get_resource_settings_from_deployment" + ) + self.mock_get_resource_settings = self.utils_patcher.start() + + # Create mock deployment + self.mock_deployment = Mock() + self.mock_deployment.id = uuid4() + self.mock_deployment.build = Mock() + self.mock_deployment.build.id = uuid4() + self.mock_deployment.pipeline_configuration = Mock() + self.mock_deployment.pipeline_configuration.name = "test_pipeline" + + # Pipeline-level settings + self.pipeline_settings = ModalOrchestratorSettings( + gpu="A100", timeout=3600, cloud="aws", region="us-east-1" + ) + + def teardown_method(self): + """Clean up after each test method.""" + self.modal_patcher.stop() + self.utils_patcher.stop() + + def _create_executor(self, **kwargs): + """Helper to create ModalSandboxExecutor with mocked dependencies.""" + from zenml.integrations.modal.orchestrators.modal_sandbox_executor import ( + ModalSandboxExecutor, + ) + + return ModalSandboxExecutor( + deployment=self.mock_deployment, + stack=Mock(), + environment={}, + settings=self.pipeline_settings, + **kwargs, + ) + + def test_step_resources_override_pipeline_resources(self): + """Test that step-level resources override pipeline-level resources.""" + step_name = "test_step" + + # Step has 
specific resource requirements + step_resources = ResourceSettings( + cpu_count=8, memory="16GB", gpu_count=2 + ) + mock_step_config = Mock() + mock_step_config.resource_settings = step_resources + mock_step_config.settings = {} + + self.mock_deployment.step_configurations = { + step_name: Mock(config=mock_step_config) + } + + # Pipeline has different defaults + pipeline_resources = ResourceSettings( + cpu_count=4, memory="8GB", gpu_count=1 + ) + self.mock_get_resource_settings.return_value = pipeline_resources + + executor = self._create_executor() + + # Step should get its specific resources, not pipeline defaults + step_result = executor._get_resource_settings(step_name) + pipeline_result = executor._get_resource_settings(None) + + assert step_result == step_resources + assert pipeline_result == pipeline_resources + assert step_result.cpu_count == 8 # Step override + assert pipeline_result.cpu_count == 4 # Pipeline default + + def test_step_modal_settings_override_pipeline_settings(self): + """Test that step-level Modal settings override pipeline settings.""" + step_name = "test_step" + + # Step overrides GPU and region + step_modal_settings = Mock() + step_modal_settings.model_dump.return_value = { + "gpu": "V100", + "region": "us-west-2", + } + + mock_step_config = Mock() + mock_step_config.settings = {"orchestrator.modal": step_modal_settings} + + self.mock_deployment.step_configurations = { + step_name: Mock(config=mock_step_config) + } + + executor = self._create_executor() + result = executor._get_step_settings(step_name) + + # Step overrides should take precedence + assert result.gpu == "V100" # Step override + assert result.region == "us-west-2" # Step override + assert result.cloud == "aws" # Pipeline default (not overridden) + assert result.timeout == 3600 # Pipeline default (not overridden) + + def test_partial_step_overrides_preserve_pipeline_defaults(self): + """Test that partial step overrides preserve non-overridden pipeline settings.""" + 
step_name = "test_step" + + # Step only overrides GPU + step_modal_settings = Mock() + step_modal_settings.model_dump.return_value = {"gpu": "T4"} + + mock_step_config = Mock() + mock_step_config.settings = {"orchestrator.modal": step_modal_settings} + + self.mock_deployment.step_configurations = { + step_name: Mock(config=mock_step_config) + } + + executor = self._create_executor() + result = executor._get_step_settings(step_name) + + assert result.gpu == "T4" # Step override + assert result.cloud == "aws" # Pipeline default preserved + assert result.region == "us-east-1" # Pipeline default preserved + assert result.timeout == 3600 # Pipeline default preserved + + @patch( + "zenml.integrations.modal.orchestrators.modal_sandbox_executor.get_gpu_values" + ) + @patch( + "zenml.integrations.modal.orchestrators.modal_sandbox_executor.get_resource_values" + ) + def test_complete_resource_merging_integration( + self, mock_get_resource_values, mock_get_gpu_values + ): + """Integration test for complete resource merging between pipeline and step.""" + step_name = "integration_step" + + # Step has specific resources and Modal settings + step_resources = ResourceSettings( + cpu_count=16, memory="32GB", gpu_count=4 + ) + step_modal_settings = Mock() + step_modal_settings.model_dump.return_value = { + "gpu": "A100", + "cloud": "gcp", + "region": "us-central1", + } + + mock_step_config = Mock() + mock_step_config.resource_settings = step_resources + mock_step_config.settings = {"orchestrator.modal": step_modal_settings} + + self.mock_deployment.step_configurations = { + step_name: Mock(config=mock_step_config) + } + + # Mock utility function returns + mock_get_gpu_values.return_value = "A100:4" + mock_get_resource_values.return_value = (16, 32000) + + executor = self._create_executor() + + # Test resource configuration + gpu_values, cpu_count, memory_mb = executor._get_resource_config( + step_name + ) + + # Test step settings + step_settings = 
executor._get_step_settings(step_name) + + # Assert step resources are used + assert gpu_values == "A100:4" + assert cpu_count == 16 + assert memory_mb == 32000 + + # Assert step Modal settings override pipeline where specified + assert step_settings.gpu == "A100" # Step override + assert step_settings.cloud == "gcp" # Step override + assert step_settings.region == "us-central1" # Step override + assert ( + step_settings.timeout == 3600 + ) # Pipeline default (not overridden) + + # Verify utility functions called with step resources + mock_get_gpu_values.assert_called_once_with("A100", step_resources) + mock_get_resource_values.assert_called_once_with(step_resources) + + def test_fallback_to_pipeline_when_no_step_config(self): + """Test fallback to pipeline settings when step has no specific configuration.""" + step_name = "minimal_step" + + # Step has no resource settings or Modal settings + mock_step_config = Mock() + mock_step_config.resource_settings = None + mock_step_config.settings = {} + + self.mock_deployment.step_configurations = { + step_name: Mock(config=mock_step_config) + } + + executor = self._create_executor() + + # Should get empty ResourceSettings (fallback) + step_resources = executor._get_resource_settings(step_name) + assert isinstance(step_resources, ResourceSettings) + assert step_resources.cpu_count is None + + # Should get pipeline Modal settings (fallback) + step_settings = executor._get_step_settings(step_name) + assert step_settings.gpu == "A100" # Pipeline setting + assert step_settings.cloud == "aws" # Pipeline setting + assert step_settings.region == "us-east-1" # Pipeline setting From 15e27a86a599c8f0f3d3a69c41cd9ce0bb52cd8b Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 13 Jul 2025 21:39:00 +0200 Subject: [PATCH 61/77] Update logging messages to be more descriptive --- .../modal/orchestrators/modal_orchestrator.py | 9 +++--- .../modal_orchestrator_entrypoint.py | 29 +++++++++---------- 
.../orchestrators/modal_sandbox_executor.py | 18 +++++++----- src/zenml/integrations/modal/utils.py | 18 ++++++------ 4 files changed, 36 insertions(+), 38 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 351f35f6060..d83d8d4d29a 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -170,7 +170,9 @@ def prepare_or_run_pipeline( execution_mode = getattr( settings, "execution_mode", ModalExecutionMode.PIPELINE ) - logger.info(f"Using execution mode: {execution_mode}") + logger.info( + f"🚀 Executing pipeline with Modal ({execution_mode.lower()} mode)" + ) # Create sandbox executor executor = ModalSandboxExecutor( @@ -180,9 +182,6 @@ def prepare_or_run_pipeline( settings=settings, ) - # Execute pipeline using the executor - logger.info("Starting pipeline execution with Modal sandboxes") - try: synchronous = ( settings.synchronous @@ -203,5 +202,5 @@ def prepare_or_run_pipeline( logger.error(f"Pipeline execution failed: {e}") raise - logger.info("Pipeline execution completed successfully") + logger.info("✅ Pipeline execution completed successfully") return None diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index 67b33008121..a57ad16f238 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -79,13 +79,13 @@ def run_step_on_modal( Raises: Exception: If the sandbox fails to execute. 
""" - logger.info(f"Running step '{step_name}' in Modal sandbox") + logger.info(f"▶️ Running step: {step_name}") try: asyncio.run(executor.execute_step(step_name)) - logger.info(f"Step {step_name} completed successfully") + logger.info(f"✅ Step completed: {step_name}") except Exception as e: - logger.error(f"Step {step_name} failed: {e}") + logger.error(f"❌ Step failed: {step_name} - {e}") raise @@ -116,7 +116,7 @@ async def prepare_shared_image_cache( ) from zenml.integrations.modal.utils import get_or_build_modal_image - logger.info("Preparing shared image cache for per-step execution") + logger.info("🔧 Preparing images for step execution") # Create shared Modal app pipeline_name = deployment.pipeline_configuration.name.replace("_", "-") @@ -156,9 +156,9 @@ async def prepare_shared_image_cache( unique_images[step_cache_key] = step_image # Build all unique images - logger.info(f"Building {len(unique_images)} unique images for pipeline") + logger.debug(f"Building {len(unique_images)} unique images for pipeline") for cache_key, image_name in unique_images.items(): - logger.info(f"Building image: {cache_key} from {image_name}") + logger.debug(f"Building image: {cache_key} from {image_name}") try: built_image = get_or_build_modal_image( image_name=image_name, @@ -168,12 +168,12 @@ async def prepare_shared_image_cache( app=shared_app, ) image_cache[cache_key] = built_image - logger.info(f"Successfully cached image: {cache_key}") + logger.debug(f"Successfully cached image: {cache_key}") except Exception as e: logger.error(f"Failed to build image {cache_key}: {e}") raise - logger.info(f"Image cache prepared with {len(image_cache)} images") + logger.info(f"✅ Prepared {len(image_cache)} container images") return image_cache, shared_app @@ -183,7 +183,7 @@ def execute_pipeline_mode(args: argparse.Namespace) -> None: Args: args: Parsed command line arguments. 
""" - logger.info("Executing entire pipeline in single sandbox") + logger.debug("Executing entire pipeline in single sandbox") entrypoint_args = PipelineEntrypointConfiguration.get_entrypoint_arguments( deployment_id=args.deployment_id ) @@ -207,10 +207,7 @@ def execute_per_step_mode( pipeline_settings: Modal orchestrator settings. args: Parsed command line arguments. """ - logger.info("Executing pipeline with per-step sandboxes") - - # Prepare shared image cache and Modal app for all steps - logger.info("Pre-building images for step execution") + logger.debug("Executing pipeline with per-step sandboxes") shared_image_cache, shared_app = asyncio.run( prepare_shared_image_cache( deployment=deployment, @@ -252,7 +249,7 @@ def finalize_wrapper(node_states: Dict[str, NodeStatus]) -> None: for step_name, step in deployment.step_configurations.items() } - logger.info(f"Executing {len(pipeline_dag)} steps with shared image cache") + logger.info(f"🚀 Executing {len(pipeline_dag)} pipeline steps") # Run using ThreadedDagRunner with optimized execution ThreadedDagRunner( @@ -330,7 +327,7 @@ def main() -> None: Raises: Exception: If the pipeline execution fails. 
""" - logger.info("Modal orchestrator sandbox started.") + logger.debug("Modal orchestrator sandbox started.") args = parse_args() os.environ[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = args.orchestrator_run_id @@ -371,7 +368,7 @@ def main() -> None: deployment, active_stack, environment, pipeline_settings, args ) - logger.info("Pipeline execution completed successfully") + logger.debug("Pipeline execution completed successfully") except Exception as e: logger.error(f"Pipeline execution failed: {e}") diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 91dc3211aa0..deccebb8302 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -455,8 +455,10 @@ async def _execute_sandbox( run_id=run_id, ) - logger.info(f"Creating sandbox for {execution_mode.lower()} execution") - logger.info(f"Sandbox tags: {tags}") + logger.debug( + f"Creating sandbox for {execution_mode.lower()} execution" + ) + logger.debug(f"Sandbox tags: {tags}") # Validate and prepare Modal API parameters modal_params = self._prepare_modal_api_params( @@ -488,7 +490,7 @@ async def _execute_sandbox( print(line, end="") await sb.wait.aio() else: - logger.info("Sandbox started asynchronously") + logger.debug("Sandbox started asynchronously") # Store the image ID for future caching after sandbox creation # The image should be hydrated after being used in sandbox creation @@ -514,7 +516,7 @@ async def _store_image_id(self, zenml_image: Any) -> None: create_if_missing=True, ) stored_id[image_name_key] = zenml_image.object_id - logger.info( + logger.debug( f"Stored Modal image ID for build {self.deployment.build.id}" ) else: @@ -571,13 +573,13 @@ def _get_cached_or_build_image( # Check shared cache first cache_key = self._get_image_cache_key(image_name, step_name) if cache_key in self.shared_image_cache: - 
logger.info( + logger.debug( f"Using cached Modal image for {step_name or 'pipeline'}: {cache_key}" ) return self.shared_image_cache[cache_key] # Fallback to existing image building logic - logger.info( + logger.debug( f"Building new Modal image for {step_name or 'pipeline'}: {image_name}" ) if self.deployment.build is None: @@ -634,7 +636,7 @@ async def execute_pipeline( run_id: The pipeline run ID. synchronous: Whether to wait for completion. """ - logger.info("Executing entire pipeline in single sandbox") + logger.debug("Executing entire pipeline in single sandbox") # Build entrypoint command command = ( @@ -670,7 +672,7 @@ async def execute_step(self, step_name: str) -> None: Args: step_name: Name of the step to execute. """ - logger.info(f"Executing step '{step_name}' in separate sandbox") + logger.debug(f"Executing step '{step_name}' in separate sandbox") # Build step entrypoint command command = StepEntrypointConfiguration.get_entrypoint_command() diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 43824319cfa..7aaa294e793 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -87,7 +87,7 @@ def setup_modal_client( # Set both token ID and secret os.environ["MODAL_TOKEN_ID"] = token_id os.environ["MODAL_TOKEN_SECRET"] = token_secret - logger.info("Using platform token ID and secret from config") + logger.debug("Using platform token ID and secret from config") logger.debug(f"Token ID starts with: {token_id[:5]}...") logger.debug(f"Token secret starts with: {token_secret[:5]}...") @@ -101,7 +101,7 @@ def setup_modal_client( # Only token ID provided os.environ["MODAL_TOKEN_ID"] = token_id - logger.info("Using platform token ID from config") + logger.debug("Using platform token ID from config") logger.warning( "Only token ID provided. Make sure MODAL_TOKEN_SECRET is set " "or platform authentication may fail." 
@@ -125,7 +125,7 @@ def setup_modal_client( logger.debug(f"Token secret starts with: {token_secret[:5]}...") else: - logger.info("Using default platform authentication (~/.modal.toml)") + logger.debug("Using default platform authentication (~/.modal.toml)") # Check if default auth exists modal_toml_path = os.path.expanduser("~/.modal.toml") if os.path.exists(modal_toml_path): @@ -262,8 +262,8 @@ def _build_modal_image_from_registry( "it is correctly configured." ) - logger.info("Building new Modal image") - logger.info(f"Base image: {image_name}") + logger.debug("Building new Modal image") + logger.debug(f"Base image: {image_name}") if docker_creds := stack.container_registry.credentials: docker_username, docker_password = docker_creds @@ -281,8 +281,8 @@ def _build_modal_image_from_registry( ) # Build Modal image from the ZenML-built image - logger.info(f"🔨 Building Modal image from base: {image_name}") - logger.info(f"Creating Modal image from base: {image_name}") + logger.debug(f"Building Modal image from base: {image_name}") + logger.debug(f"Creating Modal image from base: {image_name}") zenml_image = ( modal.Image.from_registry( image_name, secret=registry_secret @@ -324,7 +324,7 @@ def get_or_build_modal_image( if image_name_key in stored_id: image_id = stored_id[image_name_key] existing_image = modal.Image.from_id(image_id) - logger.info( + logger.debug( f"Using cached Modal image for build {build_id} in pipeline {pipeline_name}" ) return existing_image @@ -480,7 +480,7 @@ def get_resource_settings_from_deployment( memory="1024MB", gpu_count=0, ) - logger.info( + logger.debug( "No explicit pipeline-level resource settings found. 
" "Using sane defaults: %s CPU, %s memory, %s GPU", resource_settings.cpu_count, From 5f4c69a43be9bb558493423071c3ed2346eb9554 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 14 Jul 2025 23:11:29 +0200 Subject: [PATCH 62/77] Update src/zenml/integrations/modal/step_operators/modal_step_operator.py Co-authored-by: Michael Schuster --- .../integrations/modal/step_operators/modal_step_operator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/zenml/integrations/modal/step_operators/modal_step_operator.py b/src/zenml/integrations/modal/step_operators/modal_step_operator.py index b4c143eb684..094a121bb2c 100644 --- a/src/zenml/integrations/modal/step_operators/modal_step_operator.py +++ b/src/zenml/integrations/modal/step_operators/modal_step_operator.py @@ -77,8 +77,7 @@ def validator(self) -> Optional[StackValidator]: Returns: The stack validator. """ - validator: StackValidator = create_modal_stack_validator() - return validator + return create_modal_stack_validator() def get_docker_builds( self, deployment: "PipelineDeploymentBase" From bbbe9df0c81df2376f5a4c6989cf5bd8ec749918 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 14 Jul 2025 23:11:40 +0200 Subject: [PATCH 63/77] Update src/zenml/integrations/modal/orchestrators/modal_orchestrator.py Co-authored-by: Michael Schuster --- .../integrations/modal/orchestrators/modal_orchestrator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index d83d8d4d29a..5fbc301b8e8 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -97,8 +97,7 @@ def validator(self) -> Optional[StackValidator]: Returns: A `StackValidator` instance. 
""" - validator: StackValidator = create_modal_stack_validator() - return validator + return create_modal_stack_validator() def get_orchestrator_run_id(self) -> str: """Returns the active orchestrator run id. From 0208989cf7acde564e357838f6a2cedb6719589c Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 14 Jul 2025 23:21:27 +0200 Subject: [PATCH 64/77] Update execution mode setting to "mode" in Modal orchestrator --- .../component-guide/orchestrators/modal.md | 20 +++--- .../flavors/modal_orchestrator_flavor.py | 7 +- .../modal/orchestrators/modal_orchestrator.py | 64 +++++++++++++------ .../modal_orchestrator_entrypoint.py | 25 ++++---- ...l_orchestrator_entrypoint_configuration.py | 6 -- .../orchestrators/modal_sandbox_executor.py | 3 - 6 files changed, 74 insertions(+), 51 deletions(-) diff --git a/docs/book/component-guide/orchestrators/modal.md b/docs/book/component-guide/orchestrators/modal.md index 171a8d75cc4..46ee0c78b61 100644 --- a/docs/book/component-guide/orchestrators/modal.md +++ b/docs/book/component-guide/orchestrators/modal.md @@ -159,7 +159,7 @@ The Modal orchestrator uses two types of settings following ZenML's standard pat - `region` - Cloud region preference - `cloud` - Cloud provider selection - `modal_environment` - Modal environment name (e.g., "main", "dev", "prod") - - `execution_mode` - Execution strategy: "pipeline" (default) or "per_step" + - `mode` - Execution strategy: "pipeline" (default) or "per_step" - `max_parallelism` - Maximum concurrent steps (for "per_step" mode) - `timeout` - Maximum execution time in seconds - `synchronous` - Wait for completion (True) or fire-and-forget (False) @@ -202,7 +202,7 @@ modal_settings = ModalOrchestratorSettings( region="us-east-1", # Preferred region cloud="aws", # Cloud provider modal_environment="production", # Modal environment name - execution_mode="pipeline", # "pipeline" (default) or "per_step" + mode="pipeline", # "pipeline" (default) or "per_step" max_parallelism=3, # Max concurrent 
steps (per_step mode) timeout=3600, # 1 hour timeout synchronous=True, # Wait for completion @@ -284,7 +284,7 @@ The Modal orchestrator supports two execution modes: ```python modal_settings = ModalOrchestratorSettings( - execution_mode="pipeline", # Execute entire pipeline in one sandbox + mode="pipeline", # Execute entire pipeline in one sandbox gpu="A100" ) ``` @@ -300,7 +300,7 @@ modal_settings = ModalOrchestratorSettings( ```python modal_settings = ModalOrchestratorSettings( - execution_mode="per_step", # Execute each step in separate sandbox + mode="per_step", # Execute each step in separate sandbox max_parallelism=3, # Run up to 3 steps concurrently gpu="T4" # Default GPU for steps (can be overridden per step) ) @@ -321,7 +321,7 @@ In per-step mode, you can configure different resources for each step, enabling @pipeline( settings={ "orchestrator": ModalOrchestratorSettings( - execution_mode="per_step", + mode="per_step", max_parallelism=2, gpu="T4" # Default GPU for steps ) @@ -460,7 +460,7 @@ Available GPU types include: settings={ "orchestrator": ModalOrchestratorSettings( gpu="A100", - execution_mode="pipeline" # Default: entire pipeline in one sandbox + mode="pipeline" # Default: entire pipeline in one sandbox ), "resources": ResourceSettings(gpu_count=1) } @@ -475,7 +475,7 @@ def simple_gpu_pipeline(): settings={ "orchestrator": ModalOrchestratorSettings( gpu="A100", - execution_mode="per_step", # Each step in separate sandbox + mode="per_step", # Each step in separate sandbox max_parallelism=2 # Run up to 2 steps concurrently ), "resources": ResourceSettings(gpu_count=4) @@ -598,12 +598,12 @@ This ensures your pipelines start executing quickly by reusing persistent apps a ```python # Cost-effective: Pipeline mode # Single A100 GPU for 30-minute pipeline = ~$0.80 -ModalOrchestratorSettings(execution_mode="pipeline", gpu="A100") +ModalOrchestratorSettings(mode="pipeline", gpu="A100") # Higher cost: Per-step mode # A100 GPU per step (5 steps × 6 min each) 
= ~$0.80 # But steps can run in parallel, reducing total time -ModalOrchestratorSettings(execution_mode="per_step", gpu="A100") +ModalOrchestratorSettings(mode="per_step", gpu="A100") ``` **Cost Optimization Strategies**: @@ -627,7 +627,7 @@ Monitor your Modal dashboard to track sandbox execution time and resource usage. ```python modal_settings = ModalOrchestratorSettings( timeout=7200, # 2 hours - execution_mode="pipeline" # Recommended for most cases + mode="pipeline" # Recommended for most cases ) ``` diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index ae0c2b1aa7c..877218faf7b 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -49,7 +49,10 @@ class ModalOrchestratorSettings(BaseSettings): cloud: The cloud provider to use for the pipeline execution. modal_environment: The Modal environment to use for the pipeline execution. timeout: Maximum execution time in seconds (default 24h). - execution_mode: Execution mode - PIPELINE (fastest) or PER_STEP (granular). + mode: Execution mode controlling sandbox allocation. PIPELINE mode runs the + entire pipeline in a single Modal sandbox (fastest, shared resources). + PER_STEP mode runs each step in its own sandbox (granular control, + step-specific resources, better for debugging and resource isolation). max_parallelism: Maximum number of parallel sandboxes (for PER_STEP mode). synchronous: Wait for completion (True) or fire-and-forget (False). 
""" @@ -59,7 +62,7 @@ class ModalOrchestratorSettings(BaseSettings): cloud: Optional[str] = None modal_environment: Optional[str] = None timeout: int = 86400 # 24 hours (Modal's maximum) - execution_mode: ModalExecutionMode = ( + mode: ModalExecutionMode = ( ModalExecutionMode.PIPELINE ) # Default to fastest mode max_parallelism: Optional[int] = ( diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index d83d8d4d29a..72c65b9588e 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -19,6 +19,7 @@ TYPE_CHECKING, Dict, Iterator, + List, Optional, Type, cast, @@ -26,6 +27,8 @@ from uuid import uuid4 from zenml.config.base_settings import BaseSettings +from zenml.config.build_configuration import BuildConfiguration +from zenml.constants import ORCHESTRATOR_DOCKER_IMAGE_KEY from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( ModalExecutionMode, ) @@ -47,7 +50,7 @@ ModalOrchestratorConfig, ModalOrchestratorSettings, ) - from zenml.models import PipelineDeploymentResponse, PipelineRunResponse + from zenml.models import PipelineDeploymentBase, PipelineDeploymentResponse, PipelineRunResponse logger = get_logger(__name__) @@ -81,6 +84,46 @@ def settings_class(self) -> Optional[Type["BaseSettings"]]: return ModalOrchestratorSettings + def get_docker_builds( + self, deployment: "PipelineDeploymentBase" + ) -> List["BuildConfiguration"]: + """Gets the Docker builds required for the component. + + For Modal orchestrator in PIPELINE mode, per-step images are not allowed + since the entire pipeline runs in a single sandbox. + + Args: + deployment: The pipeline deployment for which to get the builds. + + Returns: + The required Docker builds. + + Raises: + ValueError: If PIPELINE mode is used with per-step Docker settings. 
+ """ + builds = super().get_docker_builds(deployment) + + # Get the execution mode from settings + settings = cast( + "ModalOrchestratorSettings", self.get_settings(deployment) + ) + execution_mode = settings.mode + + # In PIPELINE mode, check if any builds have step-specific configurations + if execution_mode == ModalExecutionMode.PIPELINE: + for build in builds: + if (build.key == ORCHESTRATOR_DOCKER_IMAGE_KEY and + build.step_name is not None): + raise ValueError( + f"Per-step Docker settings are not supported in PIPELINE " + f"execution mode. Step '{build.step_name}' has custom Docker " + f"settings but will be ignored since the entire pipeline runs " + f"in a single sandbox. Either use PER_STEP execution mode or " + f"remove step-specific Docker settings." + ) + + return builds + def _setup_modal_client(self) -> None: """Setup Modal client with authentication.""" setup_modal_client( @@ -140,36 +183,21 @@ def prepare_or_run_pipeline( Returns: None if the pipeline is executed synchronously, otherwise an iterator of metadata dictionaries. """ - if deployment.schedule: - logger.warning( - "Modal Orchestrator currently does not support the " - "use of schedules. The `schedule` will be ignored " - "and the pipeline will be run immediately." 
- ) - # Setup Modal authentication self._setup_modal_client() - # Generate orchestrator run ID and include pipeline run ID for isolation - orchestrator_run_id = str(uuid4()) - environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id - # Pass pipeline run ID for proper isolation if placeholder_run: environment["ZENML_PIPELINE_RUN_ID"] = str(placeholder_run.id) logger.debug(f"Pipeline run ID: {placeholder_run.id}") - logger.debug(f"Orchestrator run ID: {orchestrator_run_id}") - # Get settings from pipeline configuration settings = cast( "ModalOrchestratorSettings", self.get_settings(deployment) ) # Check execution mode - execution_mode = getattr( - settings, "execution_mode", ModalExecutionMode.PIPELINE - ) + execution_mode = settings.mode logger.info( f"🚀 Executing pipeline with Modal ({execution_mode.lower()} mode)" ) @@ -191,7 +219,6 @@ def prepare_or_run_pipeline( asyncio.run( executor.execute_pipeline( - orchestrator_run_id=orchestrator_run_id, run_id=str(placeholder_run.id) if placeholder_run else None, @@ -202,5 +229,4 @@ def prepare_or_run_pipeline( logger.error(f"Pipeline execution failed: {e}") raise - logger.info("✅ Pipeline execution completed successfully") return None diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index a57ad16f238..c1a1ec6cc40 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -17,7 +17,7 @@ import asyncio import os from typing import TYPE_CHECKING, Any, Dict, cast -from uuid import UUID +from uuid import UUID, uuid4 import modal @@ -61,7 +61,6 @@ def parse_args() -> argparse.Namespace: """ parser = argparse.ArgumentParser() parser.add_argument("--deployment_id", type=str, required=True) - parser.add_argument("--orchestrator_run_id", type=str, required=True) 
parser.add_argument("--run_id", type=str, required=False) return parser.parse_args() @@ -197,6 +196,7 @@ def execute_per_step_mode( environment: Dict[str, str], pipeline_settings: ModalOrchestratorSettings, args: argparse.Namespace, + orchestrator_run_id: str, ) -> None: """Execute pipeline with per-step sandboxes. @@ -206,6 +206,7 @@ def execute_per_step_mode( environment: Environment variables. pipeline_settings: Modal orchestrator settings. args: Parsed command line arguments. + orchestrator_run_id: The orchestrator run ID. """ logger.debug("Executing pipeline with per-step sandboxes") shared_image_cache, shared_app = asyncio.run( @@ -241,7 +242,7 @@ def finalize_wrapper(node_states: Dict[str, NodeStatus]) -> None: node_states: Mapping of node/step names to their execution status after DAG completion. """ - finalize_run(node_states, args) + finalize_run(node_states, args, orchestrator_run_id) # Build DAG from deployment pipeline_dag = { @@ -261,13 +262,14 @@ def finalize_wrapper(node_states: Dict[str, NodeStatus]) -> None: def finalize_run( - node_states: Dict[str, NodeStatus], args: argparse.Namespace + node_states: Dict[str, NodeStatus], args: argparse.Namespace, orchestrator_run_id: str ) -> None: """Finalize the run by updating step and pipeline run statuses. Args: node_states: The states of the nodes. args: Parsed command line arguments. + orchestrator_run_id: The orchestrator run ID. 
""" try: client = Client() @@ -278,7 +280,7 @@ def finalize_run( if args.run_id: list_args = dict(id=UUID(args.run_id)) else: - list_args = dict(orchestrator_run_id=args.orchestrator_run_id) + list_args = dict(orchestrator_run_id=orchestrator_run_id) pipeline_runs = client.list_pipeline_runs( hydrate=True, @@ -330,7 +332,10 @@ def main() -> None: logger.debug("Modal orchestrator sandbox started.") args = parse_args() - os.environ[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = args.orchestrator_run_id + + # Generate orchestrator run ID locally since it's just a random UUID + orchestrator_run_id = str(uuid4()) + os.environ[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id client = Client() active_stack = client.active_stack @@ -352,12 +357,10 @@ def main() -> None: ) environment = get_config_environment_vars() - environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = args.orchestrator_run_id + environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id # Check execution mode - execution_mode = getattr( - pipeline_settings, "execution_mode", ModalExecutionMode.PIPELINE - ) + execution_mode = pipeline_settings.mode try: # Execute pipeline based on execution mode @@ -365,7 +368,7 @@ def main() -> None: execute_pipeline_mode(args) else: execute_per_step_mode( - deployment, active_stack, environment, pipeline_settings, args + deployment, active_stack, environment, pipeline_settings, args, orchestrator_run_id ) logger.debug("Pipeline execution completed successfully") diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py index db0f478f631..f638f702819 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint_configuration.py @@ -19,7 +19,6 @@ from uuid import UUID DEPLOYMENT_ID_OPTION = "deployment_id" 
-ORCHESTRATOR_RUN_ID_OPTION = "orchestrator_run_id" RUN_ID_OPTION = "run_id" @@ -35,7 +34,6 @@ def get_entrypoint_options(cls) -> Set[str]: """ options = { DEPLOYMENT_ID_OPTION, - ORCHESTRATOR_RUN_ID_OPTION, } return options @@ -57,14 +55,12 @@ def get_entrypoint_command(cls) -> List[str]: def get_entrypoint_arguments( cls, deployment_id: "UUID", - orchestrator_run_id: str, run_id: Optional["UUID"] = None, ) -> List[str]: """Gets all arguments that the entrypoint command should be called with. Args: deployment_id: ID of the deployment. - orchestrator_run_id: ID of the orchestrator run. run_id: Optional ID of the pipeline run. Returns: @@ -73,8 +69,6 @@ def get_entrypoint_arguments( args = [ f"--{DEPLOYMENT_ID_OPTION}", str(deployment_id), - f"--{ORCHESTRATOR_RUN_ID_OPTION}", - orchestrator_run_id, ] if run_id: diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index deccebb8302..69ee42eb242 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -625,14 +625,12 @@ def _get_image_cache_key( async def execute_pipeline( self, - orchestrator_run_id: str, run_id: Optional[str] = None, synchronous: bool = True, ) -> None: """Execute the entire pipeline in a single sandbox. Args: - orchestrator_run_id: The orchestrator run ID. run_id: The pipeline run ID. synchronous: Whether to wait for completion. 
""" @@ -652,7 +650,6 @@ async def execute_pipeline( args = ( ModalOrchestratorEntrypointConfiguration.get_entrypoint_arguments( deployment_id=self.deployment.id, - orchestrator_run_id=orchestrator_run_id, run_id=run_id_uuid, ) ) From 83d1bbc719e087b71319ee1f5d5953c419e953a8 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 14 Jul 2025 23:21:49 +0200 Subject: [PATCH 65/77] Refactor Modal orchestrator code for better readability --- .../flavors/modal_orchestrator_flavor.py | 6 ++--- .../modal/orchestrators/modal_orchestrator.py | 23 +++++++++++-------- .../modal_orchestrator_entrypoint.py | 13 ++++++++--- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 877218faf7b..7d02ec5d521 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -49,9 +49,9 @@ class ModalOrchestratorSettings(BaseSettings): cloud: The cloud provider to use for the pipeline execution. modal_environment: The Modal environment to use for the pipeline execution. timeout: Maximum execution time in seconds (default 24h). - mode: Execution mode controlling sandbox allocation. PIPELINE mode runs the - entire pipeline in a single Modal sandbox (fastest, shared resources). - PER_STEP mode runs each step in its own sandbox (granular control, + mode: Execution mode controlling sandbox allocation. PIPELINE mode runs the + entire pipeline in a single Modal sandbox (fastest, shared resources). + PER_STEP mode runs each step in its own sandbox (granular control, step-specific resources, better for debugging and resource isolation). max_parallelism: Maximum number of parallel sandboxes (for PER_STEP mode). synchronous: Wait for completion (True) or fire-and-forget (False). 
diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 72c65b9588e..b4353246c3b 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -24,7 +24,6 @@ Type, cast, ) -from uuid import uuid4 from zenml.config.base_settings import BaseSettings from zenml.config.build_configuration import BuildConfiguration @@ -50,7 +49,11 @@ ModalOrchestratorConfig, ModalOrchestratorSettings, ) - from zenml.models import PipelineDeploymentBase, PipelineDeploymentResponse, PipelineRunResponse + from zenml.models import ( + PipelineDeploymentBase, + PipelineDeploymentResponse, + PipelineRunResponse, + ) logger = get_logger(__name__) @@ -88,7 +91,7 @@ def get_docker_builds( self, deployment: "PipelineDeploymentBase" ) -> List["BuildConfiguration"]: """Gets the Docker builds required for the component. - + For Modal orchestrator in PIPELINE mode, per-step images are not allowed since the entire pipeline runs in a single sandbox. @@ -97,23 +100,25 @@ def get_docker_builds( Returns: The required Docker builds. - + Raises: ValueError: If PIPELINE mode is used with per-step Docker settings. """ builds = super().get_docker_builds(deployment) - + # Get the execution mode from settings settings = cast( "ModalOrchestratorSettings", self.get_settings(deployment) ) execution_mode = settings.mode - + # In PIPELINE mode, check if any builds have step-specific configurations if execution_mode == ModalExecutionMode.PIPELINE: for build in builds: - if (build.key == ORCHESTRATOR_DOCKER_IMAGE_KEY and - build.step_name is not None): + if ( + build.key == ORCHESTRATOR_DOCKER_IMAGE_KEY + and build.step_name is not None + ): raise ValueError( f"Per-step Docker settings are not supported in PIPELINE " f"execution mode. 
Step '{build.step_name}' has custom Docker " @@ -121,7 +126,7 @@ def get_docker_builds( f"in a single sandbox. Either use PER_STEP execution mode or " f"remove step-specific Docker settings." ) - + return builds def _setup_modal_client(self) -> None: diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index c1a1ec6cc40..e30a8896a7c 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -262,7 +262,9 @@ def finalize_wrapper(node_states: Dict[str, NodeStatus]) -> None: def finalize_run( - node_states: Dict[str, NodeStatus], args: argparse.Namespace, orchestrator_run_id: str + node_states: Dict[str, NodeStatus], + args: argparse.Namespace, + orchestrator_run_id: str, ) -> None: """Finalize the run by updating step and pipeline run statuses. @@ -332,7 +334,7 @@ def main() -> None: logger.debug("Modal orchestrator sandbox started.") args = parse_args() - + # Generate orchestrator run ID locally since it's just a random UUID orchestrator_run_id = str(uuid4()) os.environ[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id @@ -368,7 +370,12 @@ def main() -> None: execute_pipeline_mode(args) else: execute_per_step_mode( - deployment, active_stack, environment, pipeline_settings, args, orchestrator_run_id + deployment, + active_stack, + environment, + pipeline_settings, + args, + orchestrator_run_id, ) logger.debug("Pipeline execution completed successfully") From bb9790b5e56c5e40e87c56be4286b6a4c52e1f23 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 14 Jul 2025 23:26:41 +0200 Subject: [PATCH 66/77] Add Docker configuration validation for Modal orchestrator --- docs/book/component-guide/orchestrators/modal.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git 
a/docs/book/component-guide/orchestrators/modal.md b/docs/book/component-guide/orchestrators/modal.md index 46ee0c78b61..8b46ff8a4d3 100644 --- a/docs/book/component-guide/orchestrators/modal.md +++ b/docs/book/component-guide/orchestrators/modal.md @@ -21,12 +21,13 @@ You should use the Modal orchestrator if: * you want cost-effective ML pipeline orchestration without managing infrastructure. * you need easy access to GPUs and high-performance computing resources. * you prefer a simple setup process without complex Kubernetes configurations. +* you need flexibility between fast pipeline execution (PIPELINE mode) and step-level resource isolation (PER_STEP mode). ## When NOT to use it The Modal orchestrator may not be the best choice if: -* **You need fine-grained step isolation**: Modal orchestrator runs entire pipelines in single sandboxes, which means all steps share the same resources and environment. For pipelines requiring different resource configurations per step, consider the [Modal step operator](../step-operators/modal.md) instead. +* **You need extremely fine-grained control beyond per-step isolation**: While the Modal orchestrator supports two execution modes (PIPELINE mode for speed and PER_STEP mode for step isolation), if you need even more granular control over individual step execution environments, consider the [Modal step operator](../step-operators/modal.md) instead. * **You have strict data locality requirements**: Modal runs in specific cloud regions and may not be suitable if you need to keep data processing within specific geographic boundaries or on-premises. 
@@ -372,6 +373,16 @@ def evaluate_model(): - **Parallel execution**: Steps with different resources can run concurrently - **Flexibility**: Each step gets exactly the resources it needs +{% hint style="info" %} +**Docker Configuration Validation** + +The Modal orchestrator validates Docker configurations based on execution mode: + +- **PIPELINE mode**: Per-step Docker settings are not allowed since all steps run in the same sandbox. If you have step-specific Docker configurations, you'll get an error suggesting to either use PER_STEP mode or remove the step-specific settings. + +- **PER_STEP mode**: Per-step Docker settings are fully supported and each step can have its own Docker configuration. +{% endhint %} + ### Sandbox Architecture The Modal orchestrator uses a simplified sandbox-based architecture: From 5b09c58ad0b17dc5fb5e784a8d2bbd2161253702 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 14 Jul 2025 23:27:59 +0200 Subject: [PATCH 67/77] Refactor variable names for execution mode in ModalOrchestrator --- .../modal/orchestrators/modal_orchestrator.py | 8 ++++---- .../orchestrators/modal_orchestrator_entrypoint.py | 4 ++-- .../modal/orchestrators/modal_sandbox_executor.py | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 8a321598dcc..fef0079a506 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -110,10 +110,10 @@ def get_docker_builds( settings = cast( "ModalOrchestratorSettings", self.get_settings(deployment) ) - execution_mode = settings.mode + mode = settings.mode # In PIPELINE mode, check if any builds have step-specific configurations - if execution_mode == ModalExecutionMode.PIPELINE: + if mode == ModalExecutionMode.PIPELINE: for build in builds: if ( build.key == 
ORCHESTRATOR_DOCKER_IMAGE_KEY @@ -201,9 +201,9 @@ def prepare_or_run_pipeline( ) # Check execution mode - execution_mode = settings.mode + mode = settings.mode logger.info( - f"🚀 Executing pipeline with Modal ({execution_mode.lower()} mode)" + f"🚀 Executing pipeline with Modal ({mode.lower()} mode)" ) # Create sandbox executor diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index e30a8896a7c..1456af6478e 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -362,11 +362,11 @@ def main() -> None: environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id # Check execution mode - execution_mode = pipeline_settings.mode + mode = pipeline_settings.mode try: # Execute pipeline based on execution mode - if execution_mode == ModalExecutionMode.PIPELINE: + if mode == ModalExecutionMode.PIPELINE: execute_pipeline_mode(args) else: execute_per_step_mode( diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 69ee42eb242..bdc36195971 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -411,7 +411,7 @@ def _prepare_modal_api_params( async def _execute_sandbox( self, entrypoint_command: List[str], - execution_mode: str, + mode: str, step_name: Optional[str] = None, run_id: Optional[str] = None, synchronous: bool = True, @@ -420,7 +420,7 @@ async def _execute_sandbox( Args: entrypoint_command: Command to execute in the sandbox. - execution_mode: Execution mode for tagging. + mode: Execution mode for tagging. step_name: Name of the step (for step execution). run_id: Pipeline run ID for tagging. 
synchronous: Whether to wait for completion. @@ -450,13 +450,13 @@ async def _execute_sandbox( tags = generate_sandbox_tags( pipeline_name=self.deployment.pipeline_configuration.name, deployment_id=str(self.deployment.id), - execution_mode=execution_mode, + execution_mode=mode, step_name=step_name, run_id=run_id, ) logger.debug( - f"Creating sandbox for {execution_mode.lower()} execution" + f"Creating sandbox for {mode.lower()} execution" ) logger.debug(f"Sandbox tags: {tags}") @@ -658,7 +658,7 @@ async def execute_pipeline( # Execute pipeline sandbox await self._execute_sandbox( entrypoint_command=entrypoint_command, - execution_mode="PIPELINE", + mode="PIPELINE", run_id=run_id, synchronous=synchronous, ) @@ -681,7 +681,7 @@ async def execute_step(self, step_name: str) -> None: # Execute step sandbox await self._execute_sandbox( entrypoint_command=entrypoint_command, - execution_mode="PER_STEP", + mode="PER_STEP", step_name=step_name, synchronous=True, # Steps are always synchronous ) From 2e618d72a481d5c2980b77c8588307885a2e2bf3 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 15 Jul 2025 08:59:20 +0200 Subject: [PATCH 68/77] Refactor log statements for better readability --- .../integrations/modal/orchestrators/modal_orchestrator.py | 4 +--- .../modal/orchestrators/modal_sandbox_executor.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index fef0079a506..3952f5e696b 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -202,9 +202,7 @@ def prepare_or_run_pipeline( # Check execution mode mode = settings.mode - logger.info( - f"🚀 Executing pipeline with Modal ({mode.lower()} mode)" - ) + logger.info(f"🚀 Executing pipeline with Modal ({mode.lower()} mode)") # Create sandbox executor executor = 
ModalSandboxExecutor( diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index bdc36195971..05d6e7ace35 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -455,9 +455,7 @@ async def _execute_sandbox( run_id=run_id, ) - logger.debug( - f"Creating sandbox for {mode.lower()} execution" - ) + logger.debug(f"Creating sandbox for {mode.lower()} execution") logger.debug(f"Sandbox tags: {tags}") # Validate and prepare Modal API parameters From 44121d100d23ac4c0486bba828f084e8005580e5 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 15 Jul 2025 09:17:39 +0200 Subject: [PATCH 69/77] Update ModalOrchestrator to submit pipelines to Modal --- .../modal/orchestrators/modal_orchestrator.py | 81 ++++++++++++------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 3952f5e696b..f2c0d5060b3 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -18,7 +18,6 @@ from typing import ( TYPE_CHECKING, Dict, - Iterator, List, Optional, Type, @@ -40,8 +39,7 @@ setup_modal_client, ) from zenml.logger import get_logger -from zenml.metadata.metadata_types import MetadataType -from zenml.orchestrators import ContainerizedOrchestrator +from zenml.orchestrators import ContainerizedOrchestrator, SubmissionResult from zenml.stack import Stack, StackValidator if TYPE_CHECKING: @@ -165,27 +163,31 @@ def get_orchestrator_run_id(self) -> str: f"{ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID}." 
) - def prepare_or_run_pipeline( + def submit_pipeline( self, deployment: "PipelineDeploymentResponse", stack: "Stack", environment: Dict[str, str], placeholder_run: Optional["PipelineRunResponse"] = None, - ) -> Optional[Iterator[Dict[str, MetadataType]]]: - """Runs the complete pipeline using Modal sandboxes. + ) -> Optional[SubmissionResult]: + """Submits a pipeline to Modal for execution. + + This method submits the pipeline to Modal and returns immediately unless + synchronous execution is configured, in which case it provides a wait + function in the submission result. Args: - deployment: The pipeline deployment to prepare or run. + deployment: The pipeline deployment to submit. stack: The stack the pipeline will run on. environment: Environment variables to set in the orchestration environment. placeholder_run: An optional placeholder run for the deployment. Raises: - Exception: If pipeline execution fails. + Exception: If pipeline submission fails. Returns: - None if the pipeline is executed synchronously, otherwise an iterator of metadata dictionaries. + Optional submission result with wait function if synchronous. 
""" # Setup Modal authentication self._setup_modal_client() @@ -212,23 +214,48 @@ def prepare_or_run_pipeline( settings=settings, ) - try: - synchronous = ( - settings.synchronous - if hasattr(settings, "synchronous") - else self.config.synchronous - ) + # Determine if we should wait for completion + synchronous = ( + settings.synchronous + if hasattr(settings, "synchronous") + else self.config.synchronous + ) - asyncio.run( - executor.execute_pipeline( - run_id=str(placeholder_run.id) - if placeholder_run - else None, - synchronous=synchronous, - ) - ) - except Exception as e: - logger.error(f"Pipeline execution failed: {e}") - raise + # Submit the pipeline + run_id = str(placeholder_run.id) if placeholder_run else None + + # Execute the pipeline based on synchronous setting + if synchronous: + # Return a wait function that will execute the pipeline when called + def _wait_for_completion() -> None: + async def _execute_pipeline() -> None: + try: + await executor.execute_pipeline( + run_id=run_id, + synchronous=True, # Wait for completion + ) + logger.info( + "✅ Pipeline execution completed successfully" + ) + except Exception as e: + logger.error(f"Pipeline execution failed: {e}") + raise + + asyncio.run(_execute_pipeline()) + + return SubmissionResult(wait_for_completion=_wait_for_completion) + else: + # Fire and forget - execute the pipeline asynchronously + async def _execute_pipeline() -> None: + try: + await executor.execute_pipeline( + run_id=run_id, + synchronous=False, # Don't wait for completion + ) + logger.info("✅ Pipeline submitted successfully") + except Exception as e: + logger.error(f"Pipeline submission failed: {e}") + raise - return None + asyncio.run(_execute_pipeline()) + return None From 099264e1c7e1df11f5542cbf1f07507aff3d4fe2 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 15 Jul 2025 10:01:03 +0200 Subject: [PATCH 70/77] Refactor method signature in ModalOrchestrator class --- .../integrations/modal/orchestrators/modal_orchestrator.py 
| 3 --- 1 file changed, 3 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index f2c0d5060b3..54afd91c9bc 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -183,9 +183,6 @@ def submit_pipeline( environment. placeholder_run: An optional placeholder run for the deployment. - Raises: - Exception: If pipeline submission fails. - Returns: Optional submission result with wait function if synchronous. """ From 8d6bfeccbab812b783d079cd97e22f428c13fad5 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 15 Jul 2025 13:46:26 +0200 Subject: [PATCH 71/77] Add custom app name option for Modal orchestrator --- .../component-guide/orchestrators/modal.md | 2 + .../flavors/modal_orchestrator_flavor.py | 89 +++++++++++++++---- .../modal_orchestrator_entrypoint.py | 4 +- .../orchestrators/modal_sandbox_executor.py | 6 +- src/zenml/integrations/modal/utils.py | 30 ++++++- 5 files changed, 109 insertions(+), 22 deletions(-) diff --git a/docs/book/component-guide/orchestrators/modal.md b/docs/book/component-guide/orchestrators/modal.md index 8b46ff8a4d3..4516ccc229e 100644 --- a/docs/book/component-guide/orchestrators/modal.md +++ b/docs/book/component-guide/orchestrators/modal.md @@ -160,6 +160,7 @@ The Modal orchestrator uses two types of settings following ZenML's standard pat - `region` - Cloud region preference - `cloud` - Cloud provider selection - `modal_environment` - Modal environment name (e.g., "main", "dev", "prod") + - `app_name` - Custom Modal app name (defaults to pipeline name) - `mode` - Execution strategy: "pipeline" (default) or "per_step" - `max_parallelism` - Maximum concurrent steps (for "per_step" mode) - `timeout` - Maximum execution time in seconds @@ -203,6 +204,7 @@ modal_settings = ModalOrchestratorSettings( region="us-east-1", # Preferred region 
cloud="aws", # Cloud provider modal_environment="production", # Modal environment name + app_name="ml-training-prod", # Custom Modal app name (optional) mode="pipeline", # "pipeline" (default) or "per_step" max_parallelism=3, # Max concurrent steps (per_step mode) timeout=3600, # 1 hour timeout diff --git a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py index 7d02ec5d521..246f6046d89 100644 --- a/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_orchestrator_flavor.py @@ -16,6 +16,8 @@ from enum import Enum from typing import TYPE_CHECKING, Optional, Type +from pydantic import Field + from zenml.config.base_settings import BaseSettings from zenml.orchestrators import BaseOrchestratorConfig, BaseOrchestratorFlavor from zenml.utils.secret_utils import SecretField @@ -48,6 +50,7 @@ class ModalOrchestratorSettings(BaseSettings): region: The region to use for the pipeline execution. cloud: The cloud provider to use for the pipeline execution. modal_environment: The Modal environment to use for the pipeline execution. + app_name: Custom name for the Modal app (defaults to pipeline name). timeout: Maximum execution time in seconds (default 24h). mode: Execution mode controlling sandbox allocation. PIPELINE mode runs the entire pipeline in a single Modal sandbox (fastest, shared resources). @@ -57,19 +60,60 @@ class ModalOrchestratorSettings(BaseSettings): synchronous: Wait for completion (True) or fire-and-forget (False). 
""" - gpu: Optional[str] = None - region: Optional[str] = None - cloud: Optional[str] = None - modal_environment: Optional[str] = None - timeout: int = 86400 # 24 hours (Modal's maximum) - mode: ModalExecutionMode = ( - ModalExecutionMode.PIPELINE - ) # Default to fastest mode - max_parallelism: Optional[int] = ( - None # Maximum number of parallel sandboxes (for PER_STEP mode) + gpu: Optional[str] = Field( + None, + description="GPU type for pipeline execution. Must be a valid Modal GPU type. " + "Examples: 'T4' (cost-effective), 'A100' (high-performance), 'V100' (training workloads). " + "Use ResourceSettings.gpu_count to specify number of GPUs. If not specified, uses CPU-only execution", + ) + region: Optional[str] = Field( + None, + description="Cloud region for pipeline execution. Must be a valid region for the selected cloud provider. " + "Examples: 'us-east-1', 'us-west-2', 'eu-west-1'. If not specified, Modal uses default region " + "based on cloud provider and availability", + ) + cloud: Optional[str] = Field( + None, + description="Cloud provider for pipeline execution. Must be a valid Modal-supported cloud provider. " + "Examples: 'aws', 'gcp'. If not specified, Modal uses default cloud provider " + "based on workspace configuration", + ) + modal_environment: Optional[str] = Field( + None, + description="Modal environment name for pipeline execution. Must be a valid environment " + "configured in your Modal workspace. Examples: 'main', 'staging', 'production'. " + "If not specified, uses the default environment for the workspace", + ) + app_name: Optional[str] = Field( + None, + description="Specifies custom name for the Modal app used for pipeline execution. " + "Must be a valid Modal app name containing only alphanumeric characters, " + "hyphens, and underscores. Examples: 'ml-training-app', 'data_pipeline_prod', " + "'zenml-experiments'. 
If not provided, defaults to 'zenml-pipeline-{pipeline_name}'", + ) + timeout: int = Field( + 86400, + description="Maximum execution time in seconds for pipeline completion. Must be between 1 and 86400 seconds. " + "Examples: 3600 (1 hour), 7200 (2 hours), 86400 (24 hours maximum). " + "Pipeline execution will be terminated if it exceeds this timeout", ) - synchronous: bool = ( - True # Wait for completion (True) or fire-and-forget (False) + mode: ModalExecutionMode = Field( + ModalExecutionMode.PIPELINE, + description="Execution mode controlling sandbox allocation strategy. PIPELINE mode runs entire pipeline " + "in single Modal sandbox for fastest execution with shared resources. PER_STEP mode runs each step " + "in separate sandbox for granular control and resource isolation. Examples: 'pipeline', 'per_step'", + ) + max_parallelism: Optional[int] = Field( + None, + description="Maximum number of parallel sandboxes for PER_STEP execution mode. Must be positive integer. " + "Examples: 5 (up to 5 parallel steps), 10 (higher parallelism). Only applies when mode='per_step'. " + "If not specified, Modal determines optimal parallelism based on pipeline structure", + ) + synchronous: bool = Field( + True, + description="Controls whether pipeline execution blocks the client until completion. If True, " + "client waits for all steps to finish before returning. If False, returns immediately " + "and executes asynchronously. Useful for long-running production pipelines", ) @@ -89,9 +133,24 @@ class ModalOrchestratorConfig( are inherited from ModalOrchestratorSettings. """ - token_id: Optional[str] = SecretField(default=None) - token_secret: Optional[str] = SecretField(default=None) - workspace: Optional[str] = None + token_id: Optional[str] = SecretField( + default=None, + description="Modal API token ID for authentication. Must be in format 'ak-xxxxx' as provided by Modal. " + "Example: 'ak-1234567890abcdef'. 
If not provided, falls back to Modal's default authentication " + "from ~/.modal.toml file. Required for programmatic access to Modal API", + ) + token_secret: Optional[str] = SecretField( + default=None, + description="Modal API token secret for authentication. Must be in format 'as-xxxxx' as provided by Modal. " + "Example: 'as-abcdef1234567890'. Used together with token_id for API authentication. " + "If not provided, falls back to Modal's default authentication from ~/.modal.toml file", + ) + workspace: Optional[str] = Field( + None, + description="Modal workspace name for pipeline execution. Must be a valid workspace name " + "you have access to. Examples: 'my-company', 'ml-team', 'personal-workspace'. " + "If not specified, uses the default workspace from Modal configuration", + ) @property def is_remote(self) -> bool: diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index 1456af6478e..1176dc60df3 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -43,6 +43,7 @@ ) from zenml.integrations.modal.utils import ( ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, + get_modal_app_name, setup_modal_client, ) from zenml.logger import get_logger @@ -118,8 +119,7 @@ async def prepare_shared_image_cache( logger.info("🔧 Preparing images for step execution") # Create shared Modal app - pipeline_name = deployment.pipeline_configuration.name.replace("_", "-") - app_name = f"zenml-pipeline-{pipeline_name}" + app_name = get_modal_app_name(settings, deployment) shared_app = modal.App.lookup( app_name, create_if_missing=True, diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 05d6e7ace35..9bb88d4511a 100644 --- 
a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -32,6 +32,7 @@ from zenml.integrations.modal.utils import ( generate_sandbox_tags, get_gpu_values, + get_modal_app_name, get_or_build_modal_image, get_resource_settings_from_deployment, get_resource_values, @@ -80,10 +81,7 @@ def __init__( self.app_name = shared_app.name else: # Create Modal app for this pipeline - pipeline_name = deployment.pipeline_configuration.name.replace( - "_", "-" - ) - self.app_name = f"zenml-pipeline-{pipeline_name}" + self.app_name = get_modal_app_name(settings, deployment) self.app = modal.App.lookup( self.app_name, create_if_missing=True, diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 7aaa294e793..1d0ece0ea26 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -14,7 +14,7 @@ """Shared utilities for Modal integration components.""" import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import modal @@ -24,6 +24,12 @@ from zenml.logger import get_logger from zenml.stack import Stack, StackValidator +if TYPE_CHECKING: + from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( + ModalOrchestratorSettings, + ) + from zenml.models import PipelineDeploymentResponse + logger = get_logger(__name__) # Common environment variable for Modal orchestrator run ID @@ -488,3 +494,25 @@ def get_resource_settings_from_deployment( resource_settings.gpu_count, ) return resource_settings + + +def get_modal_app_name( + settings: "ModalOrchestratorSettings", + deployment: "PipelineDeploymentResponse", +) -> str: + """Get the Modal app name from settings or generate default from pipeline name. + + Args: + settings: Modal orchestrator settings object. + deployment: The pipeline deployment object. 
+ + Returns: + The Modal app name to use. + """ + if settings.app_name: + return settings.app_name + else: + pipeline_name = deployment.pipeline_configuration.name.replace( + "_", "-" + ) + return f"zenml-pipeline-{pipeline_name}" From 8ec1c08c5d8924d5bfc395e004d451ac9e98c026 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Tue, 15 Jul 2025 13:51:44 +0200 Subject: [PATCH 72/77] Add pydantic Field descriptions to ModalStepOperatorSettings --- .../flavors/modal_step_operator_flavor.py | 57 ++++++++++++++++--- 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py index 942ee91f166..aeff5a94025 100644 --- a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py @@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Optional, Type +from pydantic import Field from zenml.config.base_settings import BaseSettings from zenml.integrations.modal import MODAL_STEP_OPERATOR_FLAVOR from zenml.step_operators import BaseStepOperatorConfig, BaseStepOperatorFlavor @@ -45,11 +46,36 @@ class ModalStepOperatorSettings(BaseSettings): timeout: Maximum execution time in seconds (default 24h). """ - gpu: Optional[str] = None - region: Optional[str] = None - cloud: Optional[str] = None - modal_environment: Optional[str] = None - timeout: int = 86400 # 24 hours (Modal's maximum) + gpu: Optional[str] = Field( + None, + description="GPU type for step execution. Must be a valid Modal GPU type. " + "Examples: 'T4' (cost-effective), 'A100' (high-performance), 'V100' (training workloads). " + "Use ResourceSettings.gpu_count to specify number of GPUs. If not specified, uses CPU-only execution" + ) + region: Optional[str] = Field( + None, + description="Cloud region for step execution. Must be a valid region for the selected cloud provider. 
" + "Examples: 'us-east-1', 'us-west-2', 'eu-west-1'. If not specified, Modal uses default region " + "based on cloud provider and availability" + ) + cloud: Optional[str] = Field( + None, + description="Cloud provider for step execution. Must be a valid Modal-supported cloud provider. " + "Examples: 'aws', 'gcp'. If not specified, Modal uses default cloud provider " + "based on workspace configuration" + ) + modal_environment: Optional[str] = Field( + None, + description="Modal environment name for step execution. Must be a valid environment " + "configured in your Modal workspace. Examples: 'main', 'staging', 'production'. " + "If not specified, uses the default environment for the workspace" + ) + timeout: int = Field( + 86400, + description="Maximum execution time in seconds for step completion. Must be between 1 and 86400 seconds. " + "Examples: 3600 (1 hour), 7200 (2 hours), 86400 (24 hours maximum). " + "Step execution will be terminated if it exceeds this timeout" + ) class ModalStepOperatorConfig( @@ -68,9 +94,24 @@ class ModalStepOperatorConfig( are inherited from ModalStepOperatorSettings. """ - token_id: Optional[str] = SecretField(default=None) - token_secret: Optional[str] = SecretField(default=None) - workspace: Optional[str] = None + token_id: Optional[str] = SecretField( + default=None, + description="Modal API token ID for authentication. Must be in format 'ak-xxxxx' as provided by Modal. " + "Example: 'ak-1234567890abcdef'. If not provided, falls back to Modal's default authentication " + "from ~/.modal.toml file. Required for programmatic access to Modal API" + ) + token_secret: Optional[str] = SecretField( + default=None, + description="Modal API token secret for authentication. Must be in format 'as-xxxxx' as provided by Modal. " + "Example: 'as-abcdef1234567890'. Used together with token_id for API authentication. 
" + "If not provided, falls back to Modal's default authentication from ~/.modal.toml file" + ) + workspace: Optional[str] = Field( + None, + description="Modal workspace name for step execution. Must be a valid workspace name " + "you have access to. Examples: 'my-company', 'ml-team', 'personal-workspace'. " + "If not specified, uses the default workspace from Modal configuration" + ) @property def is_remote(self) -> bool: From d272580dee1c5c5cccaa38a8d62cd920f8ac68c1 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Tue, 29 Jul 2025 14:00:19 +0200 Subject: [PATCH 73/77] Start some cleanup --- src/zenml/config/pipeline_configurations.py | 21 +- .../flavors/modal_step_operator_flavor.py | 17 +- .../modal/orchestrators/modal_orchestrator.py | 32 +- .../orchestrators/modal_sandbox_executor.py | 292 +++++------------- .../step_operators/modal_step_operator.py | 4 +- src/zenml/integrations/modal/utils.py | 190 +----------- .../test_modal_sandbox_executor.py | 8 +- 7 files changed, 133 insertions(+), 431 deletions(-) diff --git a/src/zenml/config/pipeline_configurations.py b/src/zenml/config/pipeline_configurations.py index 8d7910fd93b..955d7f55f80 100644 --- a/src/zenml/config/pipeline_configurations.py +++ b/src/zenml/config/pipeline_configurations.py @@ -18,7 +18,7 @@ from pydantic import SerializeAsAny, field_validator -from zenml.config.constants import DOCKER_SETTINGS_KEY +from zenml.config.constants import DOCKER_SETTINGS_KEY, RESOURCE_SETTINGS_KEY from zenml.config.retry_config import StepRetryConfig from zenml.config.source import SourceWithValidator from zenml.config.strict_base_model import StrictBaseModel @@ -27,7 +27,7 @@ from zenml.utils.time_utils import utc_now if TYPE_CHECKING: - from zenml.config import DockerSettings + from zenml.config import DockerSettings, ResourceSettings from zenml.config.base_settings import BaseSettings, SettingsOrDict @@ -117,3 +117,20 @@ def docker_settings(self) -> "DockerSettings": DOCKER_SETTINGS_KEY, {} ) return 
DockerSettings.model_validate(model_or_dict) + + @property + def resource_settings(self) -> "ResourceSettings": + """Resource settings of this step configuration. + + Returns: + The resource settings of this step configuration. + """ + from zenml.config import ResourceSettings + + model_or_dict: SettingsOrDict = self.settings.get( + RESOURCE_SETTINGS_KEY, {} + ) + + if isinstance(model_or_dict, BaseSettings): + model_or_dict = model_or_dict.model_dump() + return ResourceSettings.model_validate(model_or_dict) diff --git a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py index aeff5a94025..4585460e34f 100644 --- a/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py +++ b/src/zenml/integrations/modal/flavors/modal_step_operator_flavor.py @@ -16,6 +16,7 @@ from typing import TYPE_CHECKING, Optional, Type from pydantic import Field + from zenml.config.base_settings import BaseSettings from zenml.integrations.modal import MODAL_STEP_OPERATOR_FLAVOR from zenml.step_operators import BaseStepOperatorConfig, BaseStepOperatorFlavor @@ -50,31 +51,31 @@ class ModalStepOperatorSettings(BaseSettings): None, description="GPU type for step execution. Must be a valid Modal GPU type. " "Examples: 'T4' (cost-effective), 'A100' (high-performance), 'V100' (training workloads). " - "Use ResourceSettings.gpu_count to specify number of GPUs. If not specified, uses CPU-only execution" + "Use ResourceSettings.gpu_count to specify number of GPUs. If not specified, uses CPU-only execution", ) region: Optional[str] = Field( None, description="Cloud region for step execution. Must be a valid region for the selected cloud provider. " "Examples: 'us-east-1', 'us-west-2', 'eu-west-1'. 
If not specified, Modal uses default region " - "based on cloud provider and availability" + "based on cloud provider and availability", ) cloud: Optional[str] = Field( None, description="Cloud provider for step execution. Must be a valid Modal-supported cloud provider. " "Examples: 'aws', 'gcp'. If not specified, Modal uses default cloud provider " - "based on workspace configuration" + "based on workspace configuration", ) modal_environment: Optional[str] = Field( None, description="Modal environment name for step execution. Must be a valid environment " "configured in your Modal workspace. Examples: 'main', 'staging', 'production'. " - "If not specified, uses the default environment for the workspace" + "If not specified, uses the default environment for the workspace", ) timeout: int = Field( 86400, description="Maximum execution time in seconds for step completion. Must be between 1 and 86400 seconds. " "Examples: 3600 (1 hour), 7200 (2 hours), 86400 (24 hours maximum). " - "Step execution will be terminated if it exceeds this timeout" + "Step execution will be terminated if it exceeds this timeout", ) @@ -98,19 +99,19 @@ class ModalStepOperatorConfig( default=None, description="Modal API token ID for authentication. Must be in format 'ak-xxxxx' as provided by Modal. " "Example: 'ak-1234567890abcdef'. If not provided, falls back to Modal's default authentication " - "from ~/.modal.toml file. Required for programmatic access to Modal API" + "from ~/.modal.toml file. Required for programmatic access to Modal API", ) token_secret: Optional[str] = SecretField( default=None, description="Modal API token secret for authentication. Must be in format 'as-xxxxx' as provided by Modal. " "Example: 'as-abcdef1234567890'. Used together with token_id for API authentication. 
" - "If not provided, falls back to Modal's default authentication from ~/.modal.toml file" + "If not provided, falls back to Modal's default authentication from ~/.modal.toml file", ) workspace: Optional[str] = Field( None, description="Modal workspace name for step execution. Must be a valid workspace name " "you have access to. Examples: 'my-company', 'ml-team', 'personal-workspace'. " - "If not specified, uses the default workspace from Modal configuration" + "If not specified, uses the default workspace from Modal configuration", ) @property diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 54afd91c9bc..7627557ecd9 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -35,7 +35,7 @@ ) from zenml.integrations.modal.utils import ( ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID, - create_modal_stack_validator, + get_modal_stack_validator, setup_modal_client, ) from zenml.logger import get_logger @@ -57,11 +57,7 @@ class ModalOrchestrator(ContainerizedOrchestrator): - """Orchestrator responsible for running entire pipelines on Modal. - - This orchestrator runs complete pipelines using Modal sandboxes - for maximum flexibility and efficiency with persistent app architecture. - """ + """Orchestrator responsible for running entire pipelines on Modal.""" @property def config(self) -> "ModalOrchestratorConfig": @@ -143,7 +139,7 @@ def validator(self) -> Optional[StackValidator]: Returns: A `StackValidator` instance. """ - return create_modal_stack_validator() + return get_modal_stack_validator() def get_orchestrator_run_id(self) -> str: """Returns the active orchestrator run id. @@ -186,13 +182,10 @@ def submit_pipeline( Returns: Optional submission result with wait function if synchronous. 
""" - # Setup Modal authentication self._setup_modal_client() - # Pass pipeline run ID for proper isolation if placeholder_run: environment["ZENML_PIPELINE_RUN_ID"] = str(placeholder_run.id) - logger.debug(f"Pipeline run ID: {placeholder_run.id}") # Get settings from pipeline configuration settings = cast( @@ -211,25 +204,16 @@ def submit_pipeline( settings=settings, ) - # Determine if we should wait for completion - synchronous = ( - settings.synchronous - if hasattr(settings, "synchronous") - else self.config.synchronous - ) + run_id = placeholder_run.id if placeholder_run else None - # Submit the pipeline - run_id = str(placeholder_run.id) if placeholder_run else None + if settings.synchronous: - # Execute the pipeline based on synchronous setting - if synchronous: - # Return a wait function that will execute the pipeline when called def _wait_for_completion() -> None: async def _execute_pipeline() -> None: try: await executor.execute_pipeline( run_id=run_id, - synchronous=True, # Wait for completion + synchronous=True, ) logger.info( "✅ Pipeline execution completed successfully" @@ -242,12 +226,12 @@ async def _execute_pipeline() -> None: return SubmissionResult(wait_for_completion=_wait_for_completion) else: - # Fire and forget - execute the pipeline asynchronously + async def _execute_pipeline() -> None: try: await executor.execute_pipeline( run_id=run_id, - synchronous=False, # Don't wait for completion + synchronous=False, ) logger.info("✅ Pipeline submitted successfully") except Exception as e: diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 9bb88d4511a..73b31be31db 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -13,13 +13,13 @@ # permissions and limitations under the License. 
"""Modal sandbox executor for ZenML orchestration.""" -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +from uuid import UUID import modal from zenml.client import Client -from zenml.config.constants import RESOURCE_SETTINGS_KEY -from zenml.config.resource_settings import ResourceSettings +from zenml.config.resource_settings import ByteUnit, ResourceSettings from zenml.entrypoints.step_entrypoint_configuration import ( StepEntrypointConfiguration, ) @@ -31,11 +31,8 @@ ) from zenml.integrations.modal.utils import ( generate_sandbox_tags, - get_gpu_values, get_modal_app_name, get_or_build_modal_image, - get_resource_settings_from_deployment, - get_resource_values, ) from zenml.logger import get_logger @@ -88,122 +85,58 @@ def __init__( environment_name=settings.modal_environment, ) - def _build_entrypoint_command( - self, base_command: List[str], args: List[str] - ) -> List[str]: - """Build the complete entrypoint command (without environment variables). - - Environment variables are now passed via secrets parameter to sandbox. - - Args: - base_command: Base command to execute. - args: Arguments for the command. - - Returns: - Complete command without environment variables. - """ - return base_command + args - # --------------------------------------------------------------------- # Resource utilities # --------------------------------------------------------------------- - @staticmethod - def _to_resource_settings(data: Any | None) -> ResourceSettings: - """Convert arbitrary input to a ``ResourceSettings`` object. - - This helper makes sure that we *always* return a properly validated - ``ResourceSettings`` instance. It gracefully handles different shapes - that may appear in historical deployments (actual instance, pydantic - model, plain dict, or even a generic object with ``__dict__``). 
+ def _get_settings( + self, step_name: Optional[str] = None + ) -> ModalOrchestratorSettings: + """Get settings for a specific step or pipeline. Args: - data: Raw resource settings information. + step_name: Optional step name for which to fetch settings. If not + given, pipeline-level settings are returned. Returns: - A validated ``ResourceSettings`` instance (empty when no data). + Pipeline or step settings. """ - # Already a ResourceSettings – just return - if isinstance(data, ResourceSettings): - return data - - # Nothing configured – return an empty instance - if data is None: - return ResourceSettings() - - # Convert pydantic/BaseSettings models to dict first - if hasattr(data, "model_dump"): - try: - data = data.model_dump() - except Exception: - # Fallback to __dict__ if model_dump fails for some reason - data = getattr(data, "__dict__", {}) - - # Convert mapping-like objects to dict - if not isinstance(data, dict): - try: - data = dict(data) - except Exception: - # If conversion fails, return empty settings instead of error - logger.warning( - "Unable to interpret resource settings of type %s – falling back to default.", - type(data), - ) - return ResourceSettings() - - # Finally validate - try: - return ResourceSettings.model_validate(data) - except Exception as e: - logger.warning( - "Failed to validate resource settings %s – %s. Using default.", - data, - e, - ) - return ResourceSettings() + container = ( + self.deployment.step_configurations[step_name] + if step_name + else self.deployment + ) + return cast( + ModalOrchestratorSettings, + self.stack.orchestrator.get_settings(container), + ) - def _get_step_settings(self, step_name: str) -> ModalOrchestratorSettings: - """Get merged settings for a specific step. + def _get_resource_settings( + self, step_name: Optional[str] = None + ) -> ResourceSettings: + """Return validated resource settings for either pipeline or step. Args: - step_name: Name of the step. 
+ step_name: Optional name of the step for which to fetch resource + settings. If ``None`` (default), pipeline-level settings are + returned. Returns: - Merged Modal orchestrator settings. + A validated ``ResourceSettings`` object (never ``None``). """ - # Start with pipeline-level settings - pipeline_settings_dict = self.settings.model_dump() - - # Get step-specific settings - if step_name in self.deployment.step_configurations: - step_config = self.deployment.step_configurations[step_name].config - step_settings = step_config.settings.get("orchestrator.modal") - - if step_settings: - # Handle both dict and Pydantic model cases - if hasattr(step_settings, "model_dump"): - step_settings_data = step_settings.model_dump() - else: - step_settings_data = ( - dict(step_settings) if step_settings else {} - ) + if step_name: + return self.deployment.step_configurations[ + step_name + ].config.resource_settings + else: + return self.deployment.pipeline_configuration.resource_settings - step_modal_settings = ModalOrchestratorSettings.model_validate( - step_settings_data - ) - # Merge step settings over pipeline settings - step_settings_dict = step_modal_settings.model_dump( - exclude_unset=True - ) - for key, value in step_settings_dict.items(): - if value is not None: - pipeline_settings_dict[key] = value - - # Create merged settings from the combined dictionary - merged_settings = ModalOrchestratorSettings.model_validate( - pipeline_settings_dict - ) - return merged_settings + # TODO: Maybe use defaults? + # resource_settings = ResourceSettings( + # cpu_count=1, + # memory="1024MB", + # gpu_count=0, + # ) def _create_environment_secret(self) -> Optional[Any]: """Create a Modal secret containing environment variables. 
@@ -222,58 +155,6 @@ def _create_environment_secret(self) -> Optional[Any]: } return modal.Secret.from_dict(env_dict) - def _get_resource_settings( - self, step_name: Optional[str] = None - ) -> ResourceSettings: - """Return validated resource settings for either pipeline or step. - - The helper always returns a *proper* :class:`~zenml.config.resource_settings.ResourceSettings` - instance. For a step it checks the step-level settings first and - falls back to an empty configuration; for the pipeline it delegates - to :func:`zenml.integrations.modal.utils.get_resource_settings_from_deployment`. - - Args: - step_name: Optional name of the step for which to fetch resource - settings. If ``None`` (default), pipeline-level settings are - returned. - - Returns: - A validated ``ResourceSettings`` object (never ``None``). - """ - if step_name: - step_cfg = self.deployment.step_configurations[step_name].config - - # 1) direct attribute - res = self._to_resource_settings(step_cfg.resource_settings) - if not res.empty: - logger.debug( - "Using direct resource settings for step %s", step_name - ) - return res - - # 2) settings["resources"] fallback - res = self._to_resource_settings( - step_cfg.settings.get(RESOURCE_SETTINGS_KEY) - ) - if not res.empty: - logger.debug( - "Using settings-key resource settings for step %s", - step_name, - ) - return res - - logger.debug( - "No resource settings for step %s – defaulting", step_name - ) - return ResourceSettings() - - # Pipeline-level: delegate to existing util (already returns RS) - resource_settings = get_resource_settings_from_deployment( - self.deployment, RESOURCE_SETTINGS_KEY - ) - logger.debug("Using pipeline-level resource settings") - return resource_settings - def _get_resource_config( self, step_name: Optional[str] = None ) -> tuple[Optional[str], Optional[int], Optional[int]]: @@ -285,40 +166,36 @@ def _get_resource_config( Returns: Tuple of (gpu_values, cpu_count, memory_mb) with validated values. 
""" - # Get resource settings using robust extraction + settings = self._get_settings(step_name) resource_settings = self._get_resource_settings(step_name) - # Get GPU configuration (with default type if unspecified but gpu_count > 0) - gpu_type: Optional[str] = None - if step_name: - step_settings = self._get_step_settings(step_name) - gpu_type = step_settings.gpu - else: - gpu_type = self.settings.gpu - - # If gpu_type is missing but gpu_count > 0, default to T4 - if ( - gpu_type is None - and resource_settings.gpu_count - and resource_settings.gpu_count > 0 - ): + cpu_count: Optional[int] = None + if resource_settings.cpu_count is not None: + cpu_count = int(resource_settings.cpu_count) + + memory_mb: Optional[int] = None + if resource_settings.memory: + memory_mb = int(resource_settings.get_memory(ByteUnit.MB)) + + gpu_value = None + gpu_type = settings.gpu + gpu_count = resource_settings.gpu_count + + if not gpu_type and gpu_count is not None: gpu_type = "T4" logger.debug( f"No GPU type specified for {'step ' + step_name if step_name else 'pipeline'}, " - f"but gpu_count={resource_settings.gpu_count}. Defaulting to {gpu_type}." + f"but gpu_count={gpu_count}. Defaulting to {gpu_type}." 
) - gpu_values = get_gpu_values(gpu_type, resource_settings) - # Get CPU and memory with validation - cpu_count, memory_mb = get_resource_values(resource_settings) - - # Log resource configuration for debugging - logger.debug( - f"Resource config for {step_name or 'pipeline'}: " - f"GPU={gpu_values}, CPU={cpu_count}, Memory={memory_mb}MB" - ) + if gpu_count == 0: + gpu_value = None + elif gpu_count is None: + gpu_value = gpu_type + else: + gpu_value = f"{gpu_type}:{gpu_count}" - return gpu_values, cpu_count, memory_mb + return gpu_value, cpu_count, memory_mb def _prepare_modal_api_params( self, @@ -411,7 +288,7 @@ async def _execute_sandbox( entrypoint_command: List[str], mode: str, step_name: Optional[str] = None, - run_id: Optional[str] = None, + run_id: Optional[UUID] = None, synchronous: bool = True, ) -> None: """Execute a sandbox with the given command. @@ -428,7 +305,7 @@ async def _execute_sandbox( # Get settings (step-specific for steps, pipeline-level for pipeline) if step_name: - step_settings = self._get_step_settings(step_name) + step_settings = self._get_settings(step_name) cloud = step_settings.cloud region = step_settings.region timeout = step_settings.timeout @@ -524,27 +401,6 @@ async def _store_image_id(self, zenml_image: Any) -> None: except Exception as e: logger.warning(f"Failed to store image ID: {e}") - def _get_image_name(self, step_name: Optional[str] = None) -> str: - """Get the image name for the pipeline or step. - - Args: - step_name: Name of the step (None for pipeline-level). - - Returns: - Image name to use. 
- """ - # Import here to avoid circular imports - from zenml.integrations.modal.orchestrators.modal_orchestrator import ( - ModalOrchestrator, - ) - - if step_name: - return ModalOrchestrator.get_image( - deployment=self.deployment, step_name=step_name - ) - else: - return ModalOrchestrator.get_image(deployment=self.deployment) - def _get_cached_or_build_image( self, step_name: Optional[str] = None ) -> Any: @@ -564,7 +420,13 @@ def _get_cached_or_build_image( ValueError: If the deployment does not have an associated build (required to identify the Docker image). """ - image_name = self._get_image_name(step_name) + from zenml.integrations.modal.orchestrators.modal_orchestrator import ( + ModalOrchestrator, + ) + + image_name = ModalOrchestrator.get_image( + deployment=self.deployment, step_name=step_name + ) # Check shared cache first cache_key = self._get_image_cache_key(image_name, step_name) @@ -621,7 +483,7 @@ def _get_image_cache_key( async def execute_pipeline( self, - run_id: Optional[str] = None, + run_id: Optional[UUID] = None, synchronous: bool = True, ) -> None: """Execute the entire pipeline in a single sandbox. 
@@ -636,20 +498,14 @@ async def execute_pipeline( command = ( ModalOrchestratorEntrypointConfiguration.get_entrypoint_command() ) - from uuid import UUID - - # Convert run_id to UUID if it's a string - run_id_uuid = None - if run_id is not None: - run_id_uuid = UUID(run_id) if isinstance(run_id, str) else run_id args = ( ModalOrchestratorEntrypointConfiguration.get_entrypoint_arguments( deployment_id=self.deployment.id, - run_id=run_id_uuid, + run_id=run_id, ) ) - entrypoint_command = self._build_entrypoint_command(command, args) + entrypoint_command = command + args # Execute pipeline sandbox await self._execute_sandbox( @@ -672,7 +528,7 @@ async def execute_step(self, step_name: str) -> None: args = StepEntrypointConfiguration.get_entrypoint_arguments( step_name=step_name, deployment_id=self.deployment.id ) - entrypoint_command = self._build_entrypoint_command(command, args) + entrypoint_command = command + args # Execute step sandbox await self._execute_sandbox( diff --git a/src/zenml/integrations/modal/step_operators/modal_step_operator.py b/src/zenml/integrations/modal/step_operators/modal_step_operator.py index 094a121bb2c..535d08fde3d 100644 --- a/src/zenml/integrations/modal/step_operators/modal_step_operator.py +++ b/src/zenml/integrations/modal/step_operators/modal_step_operator.py @@ -27,8 +27,8 @@ ) from zenml.integrations.modal.utils import ( build_modal_image, - create_modal_stack_validator, get_gpu_values, + get_modal_stack_validator, setup_modal_client, ) from zenml.logger import get_logger @@ -77,7 +77,7 @@ def validator(self) -> Optional[StackValidator]: Returns: The stack validator. 
""" - return create_modal_stack_validator() + return get_modal_stack_validator() def get_docker_builds( self, deployment: "PipelineDeploymentBase" diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 1d0ece0ea26..7288a2b5d13 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -14,12 +14,12 @@ """Shared utilities for Modal integration components.""" import os -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple +from uuid import UUID import modal from zenml.config import ResourceSettings -from zenml.config.resource_settings import ByteUnit from zenml.enums import StackComponentType from zenml.logger import get_logger from zenml.stack import Stack, StackValidator @@ -32,36 +32,9 @@ logger = get_logger(__name__) -# Common environment variable for Modal orchestrator run ID ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID = "ZENML_MODAL_ORCHESTRATOR_RUN_ID" -class ModalAuthenticationError(Exception): - """Exception raised for Modal authentication issues with helpful guidance.""" - - def __init__(self, message: str, suggestions: Optional[List[str]] = None): - """Initialize the authentication error with message and optional suggestions. - - Args: - message: The error message. - suggestions: Optional list of suggestions for fixing the issue. - """ - super().__init__(message) - self.suggestions = suggestions or [] - - def __str__(self) -> str: - """Return formatted error message with suggestions. - - Returns: - Formatted error message, optionally with suggestions. 
- """ - base_message = super().__str__() - if self.suggestions: - suggestions_text = "\n".join(f" • {s}" for s in self.suggestions) - return f"{base_message}\n\nSuggestions:\n{suggestions_text}" - return base_message - - def setup_modal_client( token_id: Optional[str] = None, token_secret: Optional[str] = None, @@ -149,96 +122,32 @@ def setup_modal_client( os.environ["MODAL_ENVIRONMENT"] = environment +# TODO: refactor step operator and remove this def get_gpu_values( gpu_type: Optional[str], resource_settings: ResourceSettings ) -> Optional[str]: """Get the GPU values for Modal components. - This function unifies GPU configuration from both Modal orchestrator settings - and ResourceSettings. It prioritizes explicit GPU type from Modal settings, - but falls back to ResourceSettings for GPU count and type. - Args: gpu_type: The GPU type from Modal settings (e.g., "T4", "A100"). resource_settings: The resource settings containing GPU configuration. Returns: The GPU string for Modal API, or None if no GPU requested. - Format: "GPU_TYPE" or "GPU_TYPE:COUNT" """ - # Check if GPU is requested via ResourceSettings - gpu_count = resource_settings.gpu_count - - # No GPU requested if no type specified if not gpu_type: return None - # No GPU requested if count is explicitly 0 + gpu_count = resource_settings.gpu_count if gpu_count == 0: return None - - # GPU type specified but no count, return just the type - if gpu_count is None: + elif gpu_count is None: return gpu_type + else: + return f"{gpu_type}:{gpu_count}" - # Both type and count specified - return f"{gpu_type}:{gpu_count}" - - -def get_resource_values( - resource_settings: ResourceSettings, -) -> Tuple[Optional[int], Optional[int]]: - """Get CPU and memory values from resource settings with validation. - - Args: - resource_settings: The resource settings. - - Returns: - Tuple of (cpu_count, memory_mb) with validated values. 
- """ - # Get CPU count with validation - cpu_count: Optional[int] = None - if resource_settings.cpu_count is not None: - cpu_count = int(resource_settings.cpu_count) - # Validate CPU count is reasonable - if cpu_count <= 0: - logger.warning(f"Invalid CPU count {cpu_count}, ignoring.") - cpu_count = None - elif cpu_count > 96: # Modal's typical max - logger.warning( - f"CPU count {cpu_count} is very high. " - "Consider if this is intentional." - ) - # Convert memory to MB if needed with validation - memory_mb: Optional[int] = None - if resource_settings.memory: - try: - memory_value = resource_settings.get_memory(ByteUnit.MB) - if memory_value is not None: - memory_mb = int(memory_value) - # Validate memory is reasonable - if memory_mb <= 0: - logger.warning(f"Invalid memory {memory_mb}MB, ignoring.") - memory_mb = None - elif memory_mb < 128: # Less than 128MB seems too low - logger.warning( - f"Memory {memory_mb}MB is very low. " - "Consider if this is intentional." - ) - elif memory_mb > 1024 * 1024: # More than 1TB seems high - logger.warning( - f"Memory {memory_mb}MB is very high. " - "Consider if this is intentional." - ) - except Exception as e: - logger.warning(f"Failed to parse memory setting: {e}") - memory_mb = None - - return cpu_count, memory_mb - - -def _build_modal_image_from_registry( +def build_modal_image( image_name: str, stack: "Stack", environment: Optional[Dict[str, str]] = None, @@ -339,7 +248,7 @@ def get_or_build_modal_image( pass # Build new image using shared helper - zenml_image = _build_modal_image_from_registry( + zenml_image = build_modal_image( image_name=image_name, stack=stack, environment=None, # No environment variables for cached images @@ -351,35 +260,12 @@ def get_or_build_modal_image( return zenml_image -def build_modal_image( - image_name: str, - stack: "Stack", - environment: Optional[Dict[str, str]] = None, -) -> Any: - """Build a Modal image from a ZenML-built Docker image. 
- - Args: - image_name: The name of the Docker image to use as base. - stack: The ZenML stack containing container registry. - environment: The environment variables to pass to the image. - - Returns: - The configured Modal image. - """ - # Use shared helper for image building - return _build_modal_image_from_registry( - image_name=image_name, - stack=stack, - environment=environment, - ) - - def generate_sandbox_tags( pipeline_name: str, deployment_id: str, execution_mode: str, step_name: Optional[str] = None, - run_id: Optional[str] = None, + run_id: Optional[UUID] = None, ) -> Dict[str, str]: """Generate tags for Modal sandboxes. @@ -404,16 +290,19 @@ def generate_sandbox_tags( tags["zenml_step"] = step_name if run_id: - tags["zenml_run_id"] = run_id + tags["zenml_run_id"] = str(run_id) return tags -def create_modal_stack_validator() -> StackValidator: - """Create a stack validator for Modal components. +def get_modal_stack_validator() -> StackValidator: + """Get a stack validator for Modal components. + + The validator ensures that the stack contains a remote artifact store and + container registry. Returns: - A StackValidator that ensures remote artifact store and container registry. + A stack validator for modal components. """ def _validate_remote_components(stack: "Stack") -> Tuple[bool, str]: @@ -451,51 +340,6 @@ def _validate_remote_components(stack: "Stack") -> Tuple[bool, str]: ) -def get_resource_settings_from_deployment( - deployment: Any, - resource_settings_key: str = "resources", -) -> ResourceSettings: - """Extract resource settings from pipeline deployment. - - Args: - deployment: The pipeline deployment. - resource_settings_key: Key to look for resource settings. - - Returns: - ResourceSettings object with the configuration. 
- """ - pipeline_resource_settings: Union[Dict[str, Any], Any] = ( - deployment.pipeline_configuration.settings.get( - resource_settings_key, {} - ) - ) - if pipeline_resource_settings: - # Convert to dict if it's a BaseSettings instance - if hasattr(pipeline_resource_settings, "model_dump"): - pipeline_resource_dict = pipeline_resource_settings.model_dump() - else: - pipeline_resource_dict = pipeline_resource_settings - resource_settings = ResourceSettings.model_validate( - pipeline_resource_dict - ) - else: - # No explicit pipeline resources: use sane defaults (ignore step-level) - # As per user request: for pipeline mode, do not fallback to max(step resources) - resource_settings = ResourceSettings( - cpu_count=1, - memory="1024MB", - gpu_count=0, - ) - logger.debug( - "No explicit pipeline-level resource settings found. " - "Using sane defaults: %s CPU, %s memory, %s GPU", - resource_settings.cpu_count, - resource_settings.memory, - resource_settings.gpu_count, - ) - return resource_settings - - def get_modal_app_name( settings: "ModalOrchestratorSettings", deployment: "PipelineDeploymentResponse", diff --git a/tests/integration/integrations/modal/orchestrators/test_modal_sandbox_executor.py b/tests/integration/integrations/modal/orchestrators/test_modal_sandbox_executor.py index 44f72116481..4a414f85df4 100644 --- a/tests/integration/integrations/modal/orchestrators/test_modal_sandbox_executor.py +++ b/tests/integration/integrations/modal/orchestrators/test_modal_sandbox_executor.py @@ -123,7 +123,7 @@ def test_step_modal_settings_override_pipeline_settings(self): } executor = self._create_executor() - result = executor._get_step_settings(step_name) + result = executor._get_settings(step_name) # Step overrides should take precedence assert result.gpu == "V100" # Step override @@ -147,7 +147,7 @@ def test_partial_step_overrides_preserve_pipeline_defaults(self): } executor = self._create_executor() - result = executor._get_step_settings(step_name) + result = 
executor._get_settings(step_name) assert result.gpu == "T4" # Step override assert result.cloud == "aws" # Pipeline default preserved @@ -197,7 +197,7 @@ def test_complete_resource_merging_integration( ) # Test step settings - step_settings = executor._get_step_settings(step_name) + step_settings = executor._get_settings(step_name) # Assert step resources are used assert gpu_values == "A100:4" @@ -237,7 +237,7 @@ def test_fallback_to_pipeline_when_no_step_config(self): assert step_resources.cpu_count is None # Should get pipeline Modal settings (fallback) - step_settings = executor._get_step_settings(step_name) + step_settings = executor._get_settings(step_name) assert step_settings.gpu == "A100" # Pipeline setting assert step_settings.cloud == "aws" # Pipeline setting assert step_settings.region == "us-east-1" # Pipeline setting From 1490d395f001644e3249ba23108192670af58bd1 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Tue, 29 Jul 2025 15:31:00 +0200 Subject: [PATCH 74/77] Next round of cleanup --- .../modal_orchestrator_entrypoint.py | 86 ++++++------------- .../orchestrators/modal_sandbox_executor.py | 84 +++++++----------- src/zenml/integrations/modal/utils.py | 72 ++++++---------- 3 files changed, 84 insertions(+), 158 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py index 1176dc60df3..01c4a425cd7 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py +++ b/src/zenml/integrations/modal/orchestrators/modal_orchestrator_entrypoint.py @@ -92,8 +92,7 @@ def run_step_on_modal( async def prepare_shared_image_cache( deployment: "PipelineDeploymentResponse", stack: "Stack", - settings: ModalOrchestratorSettings, -) -> tuple[Dict[str, Any], Any]: +) -> Dict[str, modal.Image]: """Pre-build all required images for pipeline steps and create shared Modal app. 
This function analyzes all steps in the deployment, identifies unique images @@ -105,75 +104,39 @@ async def prepare_shared_image_cache( settings: Modal orchestrator settings. Returns: - Tuple of (shared_image_cache, shared_modal_app). + The shared image cache. Raises: ValueError: If the deployment has no associated build information. Exception: For any unexpected error while building images. """ - from zenml.integrations.modal.orchestrators.modal_orchestrator import ( - ModalOrchestrator, - ) from zenml.integrations.modal.utils import get_or_build_modal_image logger.info("🔧 Preparing images for step execution") - # Create shared Modal app - app_name = get_modal_app_name(settings, deployment) - shared_app = modal.App.lookup( - app_name, - create_if_missing=True, - environment_name=settings.modal_environment, - ) - - image_cache: Dict[str, Any] = {} - - # Collect all unique images needed across all steps - unique_images: Dict[str, str] = {} # cache_key -> image_name - # Check if deployment has a build if deployment.build is None: raise ValueError( "Deployment build is None, cannot prepare image cache" ) - # Add pipeline-level image if needed - pipeline_image = ModalOrchestrator.get_image(deployment=deployment) - build_id = str(deployment.build.id) - pipeline_cache_key = ( - f"{build_id}_pipeline_{str(hash(pipeline_image))[-8:]}" - ) - unique_images[pipeline_cache_key] = pipeline_image + image_cache: Dict[str, modal.Image] = {} - # Add step-specific images - for step_name in deployment.step_configurations: - step_image = ModalOrchestrator.get_image( - deployment=deployment, step_name=step_name - ) - image_hash = str(hash(step_image))[-8:] - step_cache_key = f"{build_id}_{step_name}_{image_hash}" - unique_images[step_cache_key] = step_image - - # Build all unique images - logger.debug(f"Building {len(unique_images)} unique images for pipeline") - for cache_key, image_name in unique_images.items(): - logger.debug(f"Building image: {cache_key} from {image_name}") + for 
build_item in deployment.build.images.values(): try: - built_image = get_or_build_modal_image( - image_name=image_name, + cached_image = get_or_build_modal_image( stack=stack, pipeline_name=deployment.pipeline_configuration.name, - build_id=build_id, - app=shared_app, + build_item=build_item, + build_id=deployment.build.id, ) - image_cache[cache_key] = built_image - logger.debug(f"Successfully cached image: {cache_key}") + image_cache[build_item.image] = cached_image except Exception as e: - logger.error(f"Failed to build image {cache_key}: {e}") + logger.error(f"Failed to build image {build_item.image}: {e}") raise logger.info(f"✅ Prepared {len(image_cache)} container images") - return image_cache, shared_app + return image_cache def execute_pipeline_mode(args: argparse.Namespace) -> None: @@ -182,7 +145,7 @@ def execute_pipeline_mode(args: argparse.Namespace) -> None: Args: args: Parsed command line arguments. """ - logger.debug("Executing entire pipeline in single sandbox") + logger.debug("Executing pipeline sequentially in this sandbox") entrypoint_args = PipelineEntrypointConfiguration.get_entrypoint_arguments( deployment_id=args.deployment_id ) @@ -209,11 +172,17 @@ def execute_per_step_mode( orchestrator_run_id: The orchestrator run ID. 
""" logger.debug("Executing pipeline with per-step sandboxes") - shared_image_cache, shared_app = asyncio.run( + + app = modal.App.lookup( + get_modal_app_name(pipeline_settings, deployment), + create_if_missing=True, + environment_name=pipeline_settings.modal_environment, + ) + + shared_image_cache = asyncio.run( prepare_shared_image_cache( deployment=deployment, stack=active_stack, - settings=pipeline_settings, ) ) @@ -224,7 +193,7 @@ def execute_per_step_mode( environment=environment, settings=pipeline_settings, shared_image_cache=shared_image_cache, - shared_app=shared_app, + shared_app=app, ) def run_step_wrapper(step_name: str) -> None: @@ -350,7 +319,6 @@ def main() -> None: orchestrator.get_settings(deployment), ) - # Setup Modal client setup_modal_client( token_id=orchestrator.config.token_id, token_secret=orchestrator.config.token_secret, @@ -358,17 +326,15 @@ def main() -> None: environment=orchestrator.config.modal_environment, ) - environment = get_config_environment_vars() - environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = orchestrator_run_id - - # Check execution mode - mode = pipeline_settings.mode - try: - # Execute pipeline based on execution mode - if mode == ModalExecutionMode.PIPELINE: + if pipeline_settings.mode == ModalExecutionMode.PIPELINE: execute_pipeline_mode(args) else: + environment = get_config_environment_vars() + environment[ENV_ZENML_MODAL_ORCHESTRATOR_RUN_ID] = ( + orchestrator_run_id + ) + execute_per_step_mode( deployment, active_stack, diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 73b31be31db..a5299c65ca6 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
See the License for the specific language governing # permissions and limitations under the License. -"""Modal sandbox executor for ZenML orchestration.""" +"""Modal sandbox executor.""" -from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast from uuid import UUID import modal @@ -37,6 +37,7 @@ from zenml.logger import get_logger if TYPE_CHECKING: + from zenml.config.step_configurations import Step from zenml.models import PipelineDeploymentResponse from zenml.stack import Stack @@ -52,8 +53,8 @@ def __init__( stack: "Stack", environment: Dict[str, str], settings: ModalOrchestratorSettings, - shared_image_cache: Optional[Dict[str, Any]] = None, - shared_app: Optional[Any] = None, + shared_image_cache: Optional[Dict[str, modal.Image]] = None, + shared_app: Optional[modal.App] = None, ): """Initialize the Modal sandbox executor. @@ -101,7 +102,7 @@ def _get_settings( Returns: Pipeline or step settings. """ - container = ( + container: Union["PipelineDeploymentResponse", "Step"] = ( self.deployment.step_configurations[step_name] if step_name else self.deployment @@ -138,7 +139,7 @@ def _get_resource_settings( # gpu_count=0, # ) - def _create_environment_secret(self) -> Optional[Any]: + def _create_environment_secret(self) -> Optional[modal.Secret]: """Create a Modal secret containing environment variables. 
Returns: @@ -147,13 +148,9 @@ def _create_environment_secret(self) -> Optional[Any]: if not self.environment: return None - # Create secret from environment variables - # Modal handles efficiency internally - # Cast to Dict[str, str | None] to match Modal's expected type - env_dict: Dict[str, Optional[str]] = { - k: v for k, v in self.environment.items() - } - return modal.Secret.from_dict(env_dict) + return modal.Secret.from_dict( + cast(Dict[str, Optional[str]], self.environment) + ) def _get_resource_config( self, step_name: Optional[str] = None @@ -174,8 +171,8 @@ def _get_resource_config( cpu_count = int(resource_settings.cpu_count) memory_mb: Optional[int] = None - if resource_settings.memory: - memory_mb = int(resource_settings.get_memory(ByteUnit.MB)) + if memory_float := resource_settings.get_memory(ByteUnit.MB): + memory_mb = int(memory_float) gpu_value = None gpu_type = settings.gpu @@ -208,7 +205,7 @@ def _prepare_modal_api_params( region: Optional[str], app: Any, timeout: int, - secrets: List[Any], + secrets: List[modal.Secret], ) -> Dict[str, Any]: """Prepare and validate Modal API parameters. @@ -369,16 +366,15 @@ async def _execute_sandbox( # The image should be hydrated after being used in sandbox creation await self._store_image_id(zenml_image) - async def _store_image_id(self, zenml_image: Any) -> None: + async def _store_image_id(self, modal_image: modal.Image) -> None: """Store the image ID for future caching after sandbox creation. Args: - zenml_image: The Modal image that was used. + modal_image: The Modal image that was used. 
""" try: - # After sandbox creation, the image should be hydrated - zenml_image.hydrate() - if hasattr(zenml_image, "object_id") and zenml_image.object_id: + modal_image.hydrate() + if hasattr(modal_image, "object_id") and modal_image.object_id: if self.deployment.build is not None: image_name_key = f"zenml_image_{self.deployment.build.id}" @@ -388,7 +384,7 @@ async def _store_image_id(self, zenml_image: Any) -> None: f"zenml-image-cache-{pipeline_name}", create_if_missing=True, ) - stored_id[image_name_key] = zenml_image.object_id + stored_id[image_name_key] = modal_image.object_id logger.debug( f"Stored Modal image ID for build {self.deployment.build.id}" ) @@ -403,7 +399,7 @@ async def _store_image_id(self, zenml_image: Any) -> None: def _get_cached_or_build_image( self, step_name: Optional[str] = None - ) -> Any: + ) -> modal.Image: """Get cached Modal image or build new one if not in cache. This method first checks the shared image cache for an existing image. @@ -414,41 +410,30 @@ def _get_cached_or_build_image( step_name: Name of the step (None for pipeline-level). Returns: - Modal image (either cached or newly built). - - Raises: - ValueError: If the deployment does not have an associated build - (required to identify the Docker image). + The modal image. 
""" from zenml.integrations.modal.orchestrators.modal_orchestrator import ( ModalOrchestrator, ) + assert self.deployment.build + image_name = ModalOrchestrator.get_image( deployment=self.deployment, step_name=step_name ) - # Check shared cache first - cache_key = self._get_image_cache_key(image_name, step_name) - if cache_key in self.shared_image_cache: - logger.debug( - f"Using cached Modal image for {step_name or 'pipeline'}: {cache_key}" - ) - return self.shared_image_cache[cache_key] + if cached_image := self.shared_image_cache.get(image_name): + return cached_image - # Fallback to existing image building logic - logger.debug( - f"Building new Modal image for {step_name or 'pipeline'}: {image_name}" + build_item = self.deployment.build._get_item( + component_key="ORCHESTRATOR", step=step_name ) - if self.deployment.build is None: - raise ValueError("Deployment build is None, cannot build image") return get_or_build_modal_image( - image_name=image_name, stack=self.stack, pipeline_name=self.deployment.pipeline_configuration.name, - build_id=str(self.deployment.build.id), - app=self.app, + build_item=build_item, + build_id=self.deployment.build.id, ) def _get_image_cache_key( @@ -494,22 +479,18 @@ async def execute_pipeline( """ logger.debug("Executing entire pipeline in single sandbox") - # Build entrypoint command command = ( ModalOrchestratorEntrypointConfiguration.get_entrypoint_command() ) - args = ( ModalOrchestratorEntrypointConfiguration.get_entrypoint_arguments( deployment_id=self.deployment.id, run_id=run_id, ) ) - entrypoint_command = command + args - # Execute pipeline sandbox await self._execute_sandbox( - entrypoint_command=entrypoint_command, + entrypoint_command=command + args, mode="PIPELINE", run_id=run_id, synchronous=synchronous, @@ -523,17 +504,14 @@ async def execute_step(self, step_name: str) -> None: """ logger.debug(f"Executing step '{step_name}' in separate sandbox") - # Build step entrypoint command command = 
StepEntrypointConfiguration.get_entrypoint_command() args = StepEntrypointConfiguration.get_entrypoint_arguments( step_name=step_name, deployment_id=self.deployment.id ) - entrypoint_command = command + args - # Execute step sandbox await self._execute_sandbox( - entrypoint_command=entrypoint_command, + entrypoint_command=command + args, mode="PER_STEP", step_name=step_name, - synchronous=True, # Steps are always synchronous + synchronous=True, ) diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 7288a2b5d13..465e1520929 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -14,7 +14,7 @@ """Shared utilities for Modal integration components.""" import os -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple +from typing import TYPE_CHECKING, Dict, Optional, Tuple from uuid import UUID import modal @@ -28,7 +28,7 @@ from zenml.integrations.modal.flavors.modal_orchestrator_flavor import ( ModalOrchestratorSettings, ) - from zenml.models import PipelineDeploymentResponse + from zenml.models import BuildItem, PipelineDeploymentResponse logger = get_logger(__name__) @@ -151,7 +151,7 @@ def build_modal_image( image_name: str, stack: "Stack", environment: Optional[Dict[str, str]] = None, -) -> Any: +) -> modal.Image: """Build a Modal image from a Docker registry with authentication. This helper function centralizes the shared logic for building Modal images @@ -168,18 +168,14 @@ def build_modal_image( Raises: RuntimeError: If no Docker credentials are found. - ValueError: If no container registry is found. """ if not stack.container_registry: - raise ValueError( + raise RuntimeError( "No Container registry found in the stack. " "Please add a container registry and ensure " "it is correctly configured." 
) - logger.debug("Building new Modal image") - logger.debug(f"Base image: {image_name}") - if docker_creds := stack.container_registry.credentials: docker_username, docker_password = docker_creds else: @@ -187,7 +183,6 @@ def build_modal_image( "No Docker credentials found for the container registry." ) - # Create Modal secret for registry authentication registry_secret = modal.Secret.from_dict( { "REGISTRY_USERNAME": docker_username, @@ -195,69 +190,56 @@ def build_modal_image( } ) - # Build Modal image from the ZenML-built image - logger.debug(f"Building Modal image from base: {image_name}") - logger.debug(f"Creating Modal image from base: {image_name}") - zenml_image = ( - modal.Image.from_registry( - image_name, secret=registry_secret - ).pip_install("modal") # Install Modal in the container - ) + modal_image = modal.Image.from_registry( + image_name, secret=registry_secret + ).pip_install("modal") - # Apply environment variables if provided if environment: - zenml_image = zenml_image.env(environment) + modal_image = modal_image.env(environment) - return zenml_image + return modal_image def get_or_build_modal_image( - image_name: str, stack: "Stack", pipeline_name: str, - build_id: str, - app: Any, -) -> Any: + build_item: "BuildItem", + build_id: UUID, +) -> modal.Image: """Get existing Modal image or build new one based on pipeline name and build ID. Args: - image_name: The name of the Docker image to use as base. stack: The ZenML stack containing container registry. pipeline_name: The pipeline name for caching. + build_item: The build item to use for the image. build_id: The build ID for the image key. - app: The Modal app to store/retrieve images. Returns: The configured Modal image. 
""" - # Try to get existing image from the app - image_name_key = f"zenml_image_{build_id}" + cache_key = ( + build_item.settings_checksum or f"{build_id}-{build_item.image}" + ) try: - # Try to get stored image ID - stored_id = modal.Dict.from_name(f"zenml-image-cache-{pipeline_name}") - if image_name_key in stored_id: - image_id = stored_id[image_name_key] - existing_image = modal.Image.from_id(image_id) + remote_image_cache = modal.Dict.from_name( + f"zenml-image-cache-{pipeline_name}" + ) + + if modal_image_id := remote_image_cache.get(cache_key): + existing_image = modal.Image.from_id(modal_image_id) logger.debug( - f"Using cached Modal image for build {build_id} in pipeline {pipeline_name}" + f"Using cached Modal image for image {build_item.image} with cache key {cache_key}" ) return existing_image - except (modal.exceptions.NotFoundError, KeyError): - # Dict doesn't exist or image not found, will build new one + except (modal.exception.NotFoundError, KeyError): pass - # Build new image using shared helper - zenml_image = build_modal_image( - image_name=image_name, + return build_modal_image( + image_name=build_item.image, stack=stack, - environment=None, # No environment variables for cached images ) - - # Store the image in the app for future use - setattr(app, image_name_key, zenml_image) - - return zenml_image + # TODO: we should cache this here immediately? 
def generate_sandbox_tags( From 6f230767f8cf7e489452d345499cdf2769f2f4c1 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Tue, 29 Jul 2025 15:35:33 +0200 Subject: [PATCH 75/77] More cleanup --- .../orchestrators/modal_sandbox_executor.py | 53 ++----------------- src/zenml/integrations/modal/utils.py | 19 ++++--- 2 files changed, 16 insertions(+), 56 deletions(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index a5299c65ca6..1f1a732a1c0 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -311,14 +311,11 @@ async def _execute_sandbox( region = self.settings.region timeout = self.settings.timeout - # Get or build Modal image (with shared cache support) - zenml_image = self._get_cached_or_build_image(step_name) + modal_image = self._get_cached_or_build_image(step_name) - # Create environment secret env_secret = self._create_environment_secret() secrets = [env_secret] if env_secret else [] - # Generate tags tags = generate_sandbox_tags( pipeline_name=self.deployment.pipeline_configuration.name, deployment_id=str(self.deployment.id), @@ -327,13 +324,9 @@ async def _execute_sandbox( run_id=run_id, ) - logger.debug(f"Creating sandbox for {mode.lower()} execution") - logger.debug(f"Sandbox tags: {tags}") - - # Validate and prepare Modal API parameters modal_params = self._prepare_modal_api_params( entrypoint_command=entrypoint_command, - image=zenml_image, + image=modal_image, gpu=gpu_values, cpu=cpu_count, memory=memory_mb, @@ -345,57 +338,17 @@ async def _execute_sandbox( ) with modal.enable_output(): - # Create sandbox with validated parameters - # Pass entrypoint command as positional args and others as kwargs sb = await modal.Sandbox.create.aio( *entrypoint_command, **modal_params ) - - # Set tags sb.set_tags(tags) if synchronous: - # Stream output 
for better user experience async for line in sb.stdout: print(line, end="") await sb.wait.aio() else: - logger.debug("Sandbox started asynchronously") - - # Store the image ID for future caching after sandbox creation - # The image should be hydrated after being used in sandbox creation - await self._store_image_id(zenml_image) - - async def _store_image_id(self, modal_image: modal.Image) -> None: - """Store the image ID for future caching after sandbox creation. - - Args: - modal_image: The Modal image that was used. - """ - try: - modal_image.hydrate() - if hasattr(modal_image, "object_id") and modal_image.object_id: - if self.deployment.build is not None: - image_name_key = f"zenml_image_{self.deployment.build.id}" - - # Store the image ID in Modal's persistent storage - pipeline_name = self.deployment.pipeline_configuration.name - stored_id = modal.Dict.from_name( - f"zenml-image-cache-{pipeline_name}", - create_if_missing=True, - ) - stored_id[image_name_key] = modal_image.object_id - logger.debug( - f"Stored Modal image ID for build {self.deployment.build.id}" - ) - else: - logger.warning( - "Deployment build is None, cannot store image ID" - ) - else: - logger.warning("Image not hydrated after sandbox creation") - except Exception as e: - logger.warning(f"Failed to store image ID: {e}") + logger.debug("Sandbox started asynchronously.") def _get_cached_or_build_image( self, step_name: Optional[str] = None diff --git a/src/zenml/integrations/modal/utils.py b/src/zenml/integrations/modal/utils.py index 465e1520929..1332760195c 100644 --- a/src/zenml/integrations/modal/utils.py +++ b/src/zenml/integrations/modal/utils.py @@ -221,11 +221,11 @@ def get_or_build_modal_image( build_item.settings_checksum or f"{build_id}-{build_item.image}" ) - try: - remote_image_cache = modal.Dict.from_name( - f"zenml-image-cache-{pipeline_name}" - ) + remote_image_cache = modal.Dict.from_name( + f"zenml-image-cache-{pipeline_name}", create_if_missing=True + ) + try: if 
modal_image_id := remote_image_cache.get(cache_key): existing_image = modal.Image.from_id(modal_image_id) logger.debug( @@ -235,11 +235,18 @@ def get_or_build_modal_image( except (modal.exception.NotFoundError, KeyError): pass - return build_modal_image( + new_image = build_modal_image( image_name=build_item.image, stack=stack, ) - # TODO: we should cache this here immediately? + + new_image.hydrate() + try: + remote_image_cache[cache_key] = new_image.object_id + except Exception as e: + logger.warning(f"Failed to cache image: {e}") + + return new_image def generate_sandbox_tags( From 0533723e4a4858aabcb0622695e43043057ae84a Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Tue, 29 Jul 2025 15:38:13 +0200 Subject: [PATCH 76/77] Use correct key --- .../integrations/modal/orchestrators/modal_sandbox_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py index 1f1a732a1c0..0b96a70b956 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py +++ b/src/zenml/integrations/modal/orchestrators/modal_sandbox_executor.py @@ -379,7 +379,7 @@ def _get_cached_or_build_image( return cached_image build_item = self.deployment.build._get_item( - component_key="ORCHESTRATOR", step=step_name + component_key="orchestrator", step=step_name ) return get_or_build_modal_image( From a638328354b0d1d94b072bb55e551e91277b5cee Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Tue, 29 Jul 2025 15:39:47 +0200 Subject: [PATCH 77/77] Add todo --- .../integrations/modal/orchestrators/modal_orchestrator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py index 7627557ecd9..9fd9175a294 100644 --- a/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py +++ 
b/src/zenml/integrations/modal/orchestrators/modal_orchestrator.py @@ -209,6 +209,8 @@ def submit_pipeline( if settings.synchronous: def _wait_for_completion() -> None: + # TODO: separate this into creating the sandbox, and + # monitoring/log streaming. async def _execute_pipeline() -> None: try: await executor.execute_pipeline(