plexe-ai · vaibs-d · Apr 20, 2025 · Apr 20, 2025 · Apr 20, 2025 · Apr 20, 2025
diff --git a/docs/gpu_support.md b/docs/gpu_support.md
@@ -0,0 +1,137 @@
+# GPU Support in Plexe
+
+This document describes how GPU support works in Plexe for accelerating machine learning model training and inference.
+
+## Overview
+
+Plexe automatically detects and utilizes available GPUs for machine learning tasks, providing acceleration for:
+- Model training
+- Feature engineering
+- Inference/prediction
+
+The GPU integration works across multiple machine learning frameworks and is designed to be transparent to users.
+
+## How It Works
+
+### 1. GPU Detection
+
+Plexe uses a multi-framework approach to detect available GPUs:
+
+```python
+# From plexe/internal/common/utils/model_utils.py
+def is_gpu_available() -> bool:
+    # Check for PyTorch GPU
+    if is_package_available("torch"):
+        import torch
+        if torch.cuda.is_available():
+            return True
+
+    # Check for TensorFlow GPU
+    if is_package_available("tensorflow"):
+        import tensorflow as tf
+        gpus = tf.config.list_physical_devices('GPU')
+        if gpus:
+            return True
+
+    return False
+```
+
+### 2. Framework-Specific Configuration
+
+For each supported ML framework, Plexe automatically configures the appropriate GPU parameters:
+
+| Framework | GPU Configuration |
+|-----------|------------------|
+| XGBoost   | `tree_method: gpu_hist`, `predictor: gpu_predictor` |
+| LightGBM  | `device: gpu` |
+| CatBoost  | `task_type: GPU` |
+| PyTorch   | `device: cuda` |
+| TensorFlow| Memory growth enabled |
+
+### 3. Ray Integration
+
+Plexe uses Ray for distributed execution, with GPU support integrated in the `RayExecutor`:
+
+1. GPU resources are explicitly requested for Ray tasks
+2. GPU detection code is injected into training scripts
+3. Memory usage is monitored throughout execution
+4. Environment variables are set for framework-specific GPU usage
+
+```python
+@ray.remote(num_gpus=1)  # Request 1 GPU for this task
+def _run_code(code: str, working_dir: str, dataset_files: List[str], timeout: int) -> dict:
+    # GPU detection and code enhancement logic
+    ...
+```
+
+### 4. Code Enhancement
+
+When running code, the executor automatically enhances it with GPU detection:
+
+```python
+# GPU detection code injected into execution
+gpu_detection_code = """
+# GPU detection for ML frameworks
+import torch
+if torch.cuda.is_available():
+    print(f"GPU available for training: {torch.cuda.get_device_name(0)}")
+    device = "cuda"
+    # Try to enable GPU for common ML frameworks
+    try:
+        import os
+        # XGBoost
+        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+        # TensorFlow
+        os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
+    except Exception as e:
+        print(f"Error setting up GPU environment: {e}")
+else:
+    print("No GPU available for training")
+    device = "cpu"
+"""
+```
+
+## Usage
+
+GPU support is transparent to users - no additional configuration is required:
+
+```python
+# GPU will be automatically used if available
+model = plexe.Model(intent="Predict the target variable")
+model.build(
+    datasets=[data],
+    provider="openai/gpt-4o-mini"
+)
+```
+
+You can monitor GPU usage with the `GPUMonitorCallback`:
+
+```python
+class GPUMonitorCallback(Callback):
+    def __init__(self):
+        self.gpu_usage = []
+
+    def on_build_start(self, build_state_info):
+        if torch.cuda.is_available():
+            memory = torch.cuda.memory_allocated() / (1024**2)
+            logger.info(f"Build start - GPU memory: {memory:.2f} MB")
+            self.gpu_usage.append(("build_start", memory))
+```
+
+## Performance Considerations
+
+- GPU acceleration typically provides the most benefit for:
+  - Deep learning models with PyTorch/TensorFlow
+  - Gradient boosting frameworks (XGBoost, LightGBM, CatBoost)
+  - Large datasets with complex feature engineering
+
+- For small datasets or simple models, the CPU may be more efficient due to the overhead of GPU data transfer.
+
+## Requirements
+
+To use GPU acceleration in Plexe:
+
+1. CUDA-compatible GPU hardware
+2. Appropriate drivers (NVIDIA drivers)
+3. CUDA toolkit and cuDNN installed
+4. GPU-enabled versions of machine learning frameworks (PyTorch, TensorFlow, XGBoost, etc.)
diff --git a/plexe/internal/agents.py b/plexe/internal/agents.py
@@ -64,7 +64,6 @@ def __init__(
         ml_ops_engineer_model_id: str = "anthropic/claude-3-7-sonnet-20250219",
         verbose: bool = False,
         max_steps: int = 30,
-        distributed: bool = False,
         chain_of_thought_callable: Optional[Callable] = None,
     ):
         """
@@ -77,7 +76,6 @@ def __init__(
             ml_ops_engineer_model_id: Model ID for the ML ops engineer agent
             verbose: Whether to display detailed agent logs
             max_steps: Maximum number of steps for the orchestrator agent
-            distributed: Whether to run the agents in a distributed environment
             chain_of_thought_callable: Optional callable for chain of thought logging
         """
         self.orchestrator_model_id = orchestrator_model_id
@@ -86,7 +84,6 @@ def __init__(
         self.ml_ops_engineer_model_id = ml_ops_engineer_model_id
         self.verbose = verbose
         self.max_steps = max_steps
-        self.distributed = distributed
         self.chain_of_thought_callable = chain_of_thought_callable
 
         # Set verbosity levels
@@ -133,7 +130,7 @@ def __init__(
                 generate_training_code,
                 validate_training_code,
                 fix_training_code,
-                get_executor_tool(distributed),
+                get_executor_tool(),
                 format_final_mle_agent_response,
             ],
             add_base_tools=False,

diff --git a/plexe/internal/common/utils/model_utils.py b/plexe/internal/common/utils/model_utils.py
@@ -2,7 +2,8 @@
 This module provides utility functions for working with model descriptions and metadata.
 """
 
-from typing import Optional
+import importlib.util
+from typing import Dict, Optional
 
 
 def calculate_model_size(artifacts: list) -> Optional[int]:
@@ -39,3 +40,94 @@ def format_code_snippet(code: Optional[str]) -> Optional[str]:
         # Return first 10 and last 10 lines with a note in the middle
         return "\n".join(lines[:10] + ["# ... additional lines omitted ..."] + lines[-10:])
     return code
+
+
+def is_package_available(package_name: str) -> bool:
+    """
+    Check if a Python package is installed and available.
+
+    :param package_name: Name of the package to check
+    :return: True if the package is available, False otherwise
+    """
+    return importlib.util.find_spec(package_name) is not None
+
+
+def is_gpu_available() -> bool:
+    """
+    Check if a GPU is available for training.
+
+    :return: True if a GPU is available, False otherwise
+    """
+    # Check for PyTorch GPU
+    if is_package_available("torch"):
+        import torch
+
+        if torch.cuda.is_available():
+            return True
+
+    # Check for TensorFlow GPU
+    if is_package_available("tensorflow"):
+        import tensorflow as tf
+
+        gpus = tf.config.list_physical_devices("GPU")
+        if gpus:
+            return True
+
+    return False
+
+
+def get_device() -> str:
+    """
+    Get the device to use for PyTorch ('cuda' or 'cpu').
+
+    :return: 'cuda' if a GPU is available, 'cpu' otherwise
+    """
+    if is_package_available("torch") and is_gpu_available():
+        import torch
+
+        return "cuda" if torch.cuda.is_available() else "cpu"
+    return "cpu"
+
+
+def get_gpu_params(framework: str) -> Dict:
+    """
+    Get GPU-related parameters for different ML frameworks.
+
+    :param framework: Name of the ML framework ('xgboost', 'lightgbm', 'catboost', 'pytorch', 'tensorflow')
+    :return: Dictionary with appropriate GPU parameters for the specified framework
+    """
+    if not is_gpu_available():
+        return {}
+
+    framework = framework.lower()
+
+    # XGBoost GPU parameters
+    if framework == "xgboost":
+        return {"tree_method": "gpu_hist", "gpu_id": 0, "predictor": "gpu_predictor"}
+    # LightGBM GPU parameters
+    elif framework == "lightgbm":
+        return {"device": "gpu", "gpu_platform_id": 0, "gpu_device_id": 0}
+    # CatBoost GPU parameters
+    elif framework == "catboost":
+        return {"task_type": "GPU", "devices": "0"}
+    # PyTorch device
+    elif framework == "pytorch" or framework == "torch":
+        import torch
+
+        return {"device": torch.device("cuda" if torch.cuda.is_available() else "cpu")}
+    # TensorFlow GPU config
+    elif framework == "tensorflow" or framework == "tf":
+        try:
+            import tensorflow as tf
+
+            gpus = tf.config.list_physical_devices("GPU")
+            if gpus:
+                # Only enable memory growth to avoid allocating all GPU memory
+                for gpu in gpus:
+                    tf.config.experimental.set_memory_growth(gpu, True)
+                return {"gpu_enabled": True, "gpu_count": len(gpus)}
+        except Exception:
+            pass
+
+    # Default empty dict for other frameworks
+    return {}
diff --git a/plexe/internal/models/execution/ray_executor.py b/plexe/internal/models/execution/ray_executor.py
@@ -28,19 +28,59 @@
 logger = logging.getLogger(__name__)
 
 
-@ray.remote
+@ray.remote(num_cpus=1)
 def _run_code(code: str, working_dir: str, dataset_files: List[str], timeout: int) -> dict:
-    """Ray remote function that executes the code."""
+    """Ray remote function that executes the code with GPU support if available."""
     import subprocess
     import sys
     from pathlib import Path
 
     working_dir = Path(working_dir)
     code_file = working_dir / "run.py"
 
-    # Write code to file
+    # Check for GPU availability
+    gpu_available = False
+    gpu_info = {}
+
+    try:
+        import torch
+
+        if torch.cuda.is_available():
+            gpu_available = True
+            gpu_info = {
+                "device": torch.cuda.get_device_name(0),
+                "memory_start": torch.cuda.memory_allocated() / (1024**2),
+            }
+            print(f"[RayExecutor] GPU available: {gpu_info['device']}")
+    except ImportError:
+        print("[RayExecutor] PyTorch not available, cannot detect GPU")
+
+    # Enhance code with GPU detection
+    gpu_detection_code = ""
+    if gpu_available:
+        gpu_detection_code = """
+# GPU detection for ML frameworks
+import torch
+if torch.cuda.is_available():
+    print(f"GPU available for training: {torch.cuda.get_device_name(0)}")
+    device = "cuda"
+    # Try to enable GPU for common ML frameworks
+    try:
+        import os
+        # XGBoost
+        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+        # TensorFlow
+        os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
+    except Exception as e:
+        print(f"Error setting up GPU environment: {e}")
+else:
+    print("No GPU available for training")
+    device = "cpu"
+"""
+
+    # Write code to file with GPU detection if available
     with open(code_file, "w", encoding="utf-8") as f:
-        f.write("import os\nimport sys\nfrom pathlib import Path\n\n" + code)
+        f.write("import os\nimport sys\nfrom pathlib import Path\n\n" + gpu_detection_code + code)
 
     start_time = time.time()
     process = subprocess.Popen(
@@ -55,6 +95,20 @@ def _run_code(code: str, working_dir: str, dataset_files: List[str], timeout: in
         stdout, stderr = process.communicate(timeout=timeout)
         exec_time = time.time() - start_time
 
+        # Get final GPU stats if available
+        if gpu_available:
+            try:
+                import torch
+
+                if torch.cuda.is_available():
+                    gpu_info["memory_end"] = torch.cuda.memory_allocated() / (1024**2)
+                    gpu_info["memory_max"] = torch.cuda.max_memory_allocated() / (1024**2)
+                    print(
+                        f"[RayExecutor] GPU memory usage: {gpu_info['memory_end']:.2f} MB, max: {gpu_info['memory_max']:.2f} MB"
+                    )
+            except Exception as e:
+                print(f"[RayExecutor] Error getting GPU stats: {e}")
+
         # Collect model artifacts
         model_artifacts = []
         model_dir = working_dir / "model_files"
@@ -71,6 +125,7 @@ def _run_code(code: str, working_dir: str, dataset_files: List[str], timeout: in
             "returncode": process.returncode,
             "exec_time": exec_time,
             "model_artifacts": model_artifacts,
+            "gpu_info": gpu_info if gpu_available else {"available": False},
         }
     except subprocess.TimeoutExpired:
         process.kill()
@@ -80,6 +135,7 @@ def _run_code(code: str, working_dir: str, dataset_files: List[str], timeout: in
             "returncode": -1,
             "exec_time": timeout,
             "model_artifacts": [],
+            "gpu_info": gpu_info if gpu_available else {"available": False},
         }
 
 
@@ -146,7 +202,7 @@ def run(self) -> ExecutionResult:
             if not ready_refs:
                 ray.cancel(result_ref, force=True)
                 return ExecutionResult(
-                    term_out=[],
+                    term_out=["Execution timed out"],
                     exec_time=self.timeout,
                     exception=TimeoutError(f"Execution exceeded {self.timeout}s timeout - Ray timeout reached"),
                 )
@@ -172,7 +228,7 @@ def run(self) -> ExecutionResult:
         except ray.exceptions.GetTimeoutError:
             ray.cancel(result_ref, force=True)
             return ExecutionResult(
-                term_out=[],
+                term_out=["Execution timed out due to GetTimeoutError"],
                 exec_time=self.timeout,
                 exception=TimeoutError(f"Execution exceeded {self.timeout}s timeout - Ray timeout reached"),
             )