Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bitsandbytes Installation Hanging During Make Process on SageMaker (CUDA 12.4) #1480

Closed
mahdavis opened this issue Jan 21, 2025 · 1 comment
Labels

Comments

@mahdavis
Copy link

Hi everyone,

I'm trying to install bitsandbytes on Amazon SageMaker with CUDA 12.4. The installation process hangs during the make step. Here are the details:
Environment:

Platform: Amazon SageMaker
CUDA Version: 12.4.131
GCC Version: GNU 11.4.0
CMake: Successfully configured

Installation Steps Being Executed:

Installing build prerequisites (build-essential, cmake)
Removing any existing bitsandbytes installation
Cloning bitsandbytes repository to /tmp/bitsandbytes_build
Configuring with CMake using -DCOMPUTE_BACKEND=cuda
Attempting to build with make VERBOSE=1

Current Status:

CMake configuration completed successfully with the following capabilities:

CUDA Capabilities Available: 50;52;53;60;61;62;70;72;75;80;86;87;89;90
CUDA NVCC Flags: --use_fast_math
Build files successfully written to /tmp/bitsandbytes_build

Installation hangs at the "Building with Make..." step
No error messages are displayed; the process simply stops responding

Issue:
The installation process successfully completes the CMake configuration but appears to hang indefinitely during the make process. The last log entry shows "Building with Make..." with no further output or error messages.
Questions:

Are there known issues with building on SageMaker with CUDA 12.4?
Are there any specific make flags or configurations needed for SageMaker environments?
Is there a way to get more verbose output during the make process to identify where it's hanging?

Could you provide any guidance on how to proceed or what additional information would be helpful for debugging this issue?

Thank you!

The full code:

import subprocess
import os
import sys
from pathlib import Path
import re

def run_command(command, description=None, timeout=300):  # 5 minutes timeout
    """Run a shell command, echoing its output live, and report success.

    stdout and stderr are merged and printed line by line while the command
    runs. Capturing everything until the process exits makes a long build
    look hung and throws away the partial output when the timeout fires.

    The command runs in its own session so that, on timeout, the whole
    process group is killed. With ``shell=True`` the real work is done by
    children of the shell, and killing only the shell would leave them
    running. This relies on POSIX process groups.

    Args:
        command: Shell command line, executed with ``shell=True``.
        description: Optional label printed before the command starts.
        timeout: Seconds to wait before killing the command, or None to
            wait indefinitely.

    Returns:
        True if the command exited with status 0. False if it exited
        non-zero, timed out, or could not be started.
    """
    # Function-scope imports: nothing else in the file needs these modules.
    import signal
    import threading

    process = None
    watchdog = None
    timed_out = threading.Event()

    def _kill_process_group():
        # Kill the shell and everything it spawned.
        try:
            os.killpg(process.pid, signal.SIGKILL)
        except OSError:
            pass  # The group has already exited.

    def _on_timeout():
        timed_out.set()
        _kill_process_group()

    try:
        if description:
            print(f"\n{description}...")
        print(f"Executing command: {command}")  # Print the actual command

        process = subprocess.Popen(
            command,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # Keep errors in order with normal output
            text=True,
            errors="replace",  # Odd bytes in tool output must not abort the run
            start_new_session=True,
        )

        if timeout is not None:
            watchdog = threading.Timer(timeout, _on_timeout)
            watchdog.daemon = True
            watchdog.start()

        # The loop ends when every writer has closed the pipe, i.e. when the
        # command finished or the watchdog killed its process group.
        for line in process.stdout:
            print(line, end="", flush=True)
        returncode = process.wait()
        if watchdog is not None:
            watchdog.cancel()

        if timed_out.is_set():
            print(f"Command timed out after {timeout} seconds")
            return False
        if returncode != 0:
            print(f"Error executing command: {command}")
            print(f"Exit status: {returncode} (output shown above)")
            return False
        return True
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
        return False
    finally:
        if watchdog is not None:
            watchdog.cancel()
        if process is not None:
            # Reached with a live process only on an unexpected error or
            # KeyboardInterrupt; do not leave the build running behind us.
            if process.poll() is None:
                _kill_process_group()
                process.wait()
            process.stdout.close()

def install_bitsandbytes():
    """Build bitsandbytes from source with the CUDA backend and install it.

    Steps: install build prerequisites, uninstall any existing bitsandbytes,
    clone the repository into a scratch directory, configure with CMake,
    build with make, and pip-install the result. The scratch directory is
    removed afterwards whether or not the build succeeded.

    Returns:
        True if every required step succeeded, False otherwise.
    """
    build_dir = "/tmp/bitsandbytes_build"
    # A full CUDA build can run well past run_command's 300-second default;
    # with the default the compiler would be killed mid-build.
    build_timeout = 3600
    # Use the pip that belongs to the running interpreter, so the package
    # lands where this script (and its later checks) will look for it.
    pip = f'"{sys.executable}" -m pip' if sys.executable else "pip"

    try:
        # 1. Install build prerequisites
        if not run_command("apt-get install -y build-essential cmake", "Installing build prerequisites"):
            return False

        # 2. Remove any existing installation (best effort: failure is fine
        #    when nothing is installed)
        run_command(f"{pip} uninstall -y bitsandbytes", "Removing existing bitsandbytes")

        # Clean build directory
        if os.path.exists(build_dir):
            run_command(f"rm -rf {build_dir}", "Cleaning up existing build directory")

        # 3. Clone repository
        if not run_command(
            f"git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git {build_dir}",
            "Cloning bitsandbytes repository"
        ):
            return False

        original_dir = os.getcwd()
        os.chdir(build_dir)

        try:
            # 4. Configure an in-source build with the CUDA backend
            if not run_command("cmake -DCOMPUTE_BACKEND=cuda -S .", "Configuring with CMake"):
                return False

            # 5. Make with verbose output
            if not run_command("make VERBOSE=1", "Building with Make", timeout=build_timeout):
                # If verbose make fails, try regular make
                print("Retrying with regular make...")
                if not run_command("make", "Building with Make (retry)", timeout=build_timeout):
                    return False

            # 6. Install the package. This must NOT be an editable (-e)
            #    install: an editable install only links back to build_dir,
            #    which the outer `finally` deletes.
            if not run_command(f"{pip} install .", "Installing bitsandbytes"):
                return False

            return True

        finally:
            # Restore the caller's working directory before cleanup.
            os.chdir(original_dir)

    except Exception as e:
        print(f"Error during bitsandbytes installation: {str(e)}")
        return False
    finally:
        if os.path.exists(build_dir):
            run_command(f"rm -rf {build_dir}", "Cleaning up build directory")

# Add this new function to check build environment
def check_build_environment():
    """Probe each required build tool and print what it reports.

    Each tool's version/status command is run through the shell. On exit
    status 0 its stdout is printed; otherwise its stderr is printed. Nothing
    is returned and no probe failure stops the remaining probes.
    """
    tool_probes = {
        "CMake": "cmake --version",
        "GCC": "gcc --version",
        "NVIDIA CUDA Compiler": "nvcc --version",
        "NVIDIA System Management Interface": "nvidia-smi",
    }

    print("\nChecking build environment:")
    for tool, probe_cmd in tool_probes.items():
        print(f"\nChecking {tool}...")
        try:
            probe = subprocess.run(probe_cmd, shell=True, capture_output=True, text=True)
            succeeded = probe.returncode == 0
            headline = f"{tool} found:" if succeeded else f"{tool} check failed:"
            detail = probe.stdout if succeeded else probe.stderr
            print(headline)
            print(detail.strip())
        except Exception as e:
            print(f"Error checking {tool}: {str(e)}")

def get_cuda_version():
    """Return the CUDA version shown by nvidia-smi as ``(major, minor)``.

    The version is parsed from the "CUDA Version: X.Y" text in nvidia-smi's
    stdout. Returns None when nvidia-smi exits non-zero, when the text is
    not present, or when running it raises (the error is printed).
    """
    version_pattern = r'CUDA Version: (\d+)\.(\d+)'
    try:
        probe = subprocess.run(
            ['nvidia-smi'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        if probe.returncode != 0:
            return None

        found = re.search(version_pattern, probe.stdout)
        if found is None:
            return None

        major, minor = found.groups()
        return (int(major), int(minor))
    except Exception as e:
        print(f"Error detecting CUDA version: {e}")
    return None

def install_bnb_packages():
    """Run the whole flow: environment report, build/install, verification.

    Exits the process with status 1 if the installation or the import check
    fails; the final library lookup may also exit on its own.
    """
    # Report on the toolchain before attempting anything.
    check_build_environment()

    # The CUDA version is informational only; a miss is just a warning.
    detected = get_cuda_version()
    if not detected:
        print("Warning: Could not detect CUDA version")
    else:
        major, minor = detected
        print(f"Detected CUDA version: {major}.{minor}")

    # Build and install from source.
    print("\nInstalling bitsandbytes...")
    installed = install_bitsandbytes()
    if not installed:
        print("Failed to install bitsandbytes")
        sys.exit(1)

    # Confirm the package imports, then locate its shared library.
    verified = verify_installation()
    if not verified:
        print("Bitsandbytes installation verification failed")
        sys.exit(1)
    verify_bitsandbytes_installation()
    print("\nInstallation completed successfully!")

    
def verify_installation():
    """Check that bitsandbytes imports in a fresh interpreter.

    Runs a one-line script that imports the package and prints its version,
    its CUDA flag, and its install path. The script is run with the same
    interpreter that is executing this file, so the check looks at the
    environment this script belongs to rather than whichever ``python``
    happens to be first on PATH.

    Returns:
        True if the check script exited successfully, False otherwise.
    """
    import shlex  # Function-scope import: only needed here.

    checks = [
        "import bitsandbytes as bnb",
        "print('BitsAndBytes version:', bnb.__version__)",
        # NOTE(review): CUDA_AVAILABLE is not confirmed to be a public
        # bitsandbytes attribute; reading it with getattr keeps a working
        # install from failing verification on that alone.
        "print('CUDA available:', getattr(bnb, 'CUDA_AVAILABLE', 'unknown'))",
        "print('BitsAndBytes path:', bnb.__file__)"
    ]
    # sys.executable can be empty in embedded interpreters; fall back to PATH.
    interpreter = shlex.quote(sys.executable) if sys.executable else "python"
    verify_cmd = "{} -c \"{}\"".format(interpreter, "; ".join(checks))
    return run_command(verify_cmd, "Verifying bitsandbytes installation")

def verify_bitsandbytes_installation(file_pattern="libbitsandbytes_cuda*.so", search_paths=None):
    """Find the bitsandbytes CUDA library and add its directory to LD_LIBRARY_PATH.

    The library file name embeds the CUDA version it was built for, so the
    lookup uses a glob pattern rather than one fixed version.

    Args:
        file_pattern: Glob pattern for the library file name.
        search_paths: Directories to search recursively. Defaults to
            ``/usr/local/lib``, ``/usr/lib`` and ``/opt/conda/lib``.
            NOTE(review): an install into a virtualenv or user site would
            not be under these defaults.

    Returns:
        None. On success ``os.environ["LD_LIBRARY_PATH"]`` includes the
        library's directory.
        NOTE(review): this presumably only helps processes started
        afterwards; confirm whether the current process needs it.

    Raises:
        SystemExit: With status 1 if no matching file is found.
    """
    if search_paths is None:
        search_paths = ["/usr/local/lib", "/usr/lib", "/opt/conda/lib"]

    for base_path in search_paths:
        base_path = Path(base_path)
        if not base_path.exists():
            continue

        # Stop at the first hit instead of walking the whole tree.
        match = next(base_path.rglob(file_pattern), None)
        if match is None:
            continue

        bnb_path = str(match.parent)
        print(f"Found {match.name} in: {bnb_path}")

        # Rebuild the variable from its non-empty entries. Blindly appending
        # ":<dir>" to an empty value creates an empty entry, which the loader
        # treats as the current directory, and repeated calls would pile up
        # duplicates.
        entries = [entry for entry in os.environ.get("LD_LIBRARY_PATH", "").split(":") if entry]
        if bnb_path not in entries:
            entries.append(bnb_path)
        os.environ["LD_LIBRARY_PATH"] = ":".join(entries)
        print(f"Updated LD_LIBRARY_PATH: {os.environ['LD_LIBRARY_PATH']}")
        return

    print(f"Error: {file_pattern} not found in any of the search paths. Ensure bitsandbytes is correctly installed.")
    sys.exit(1)

    
if __name__ == "__main__":
    # Script entry point: run the full check / build / verify flow.
    install_bnb_packages()


@mahdavis
Copy link
Author

Solved the issue by setting these two environment variables in my Python script before starting the bitsandbytes installation:
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64'
os.environ['CUDA_HOME'] = '/usr/local/cuda'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

2 participants