Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Need support for installing bitsandbytes on Cuda121 #1479

Closed
mahdavis opened this issue Jan 21, 2025 · 1 comment
Closed

Need support for installing bitsandbytes on Cuda121 #1479

mahdavis opened this issue Jan 21, 2025 · 1 comment

Comments

@mahdavis
Copy link

Hi everyone,
I'm trying to install bitsandbytes on SageMaker but running into some issues. The instance has CUDA 12.4 installed, and I'm trying to install bitsandbytes with CUDA 12.1 compatibility (since it's more stable for bitsandbytes).

Steps are:

# 1. Removed existing installations
pip uninstall -y bitsandbytes

# 2. Set up CUDA environment
CUDA_HOME="/root/local/cuda-12.1"
LD_LIBRARY_PATH="/root/local/cuda-12.1/lib64:$LD_LIBRARY_PATH"
BNB_CUDA_VERSION="121"

# 3. Cloned and installed from source
git clone https://github.com/timdettmers/bitsandbytes.git
cd bitsandbytes
pip install -e .

The LD_LIBRARY_PATH seems correct:

/root/local/cuda-12.1/lib64:/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/home/.openmpi/lib/

However, I'm getting errors that suggest the CUDA libraries aren't being found:

Missing: /root/local/cuda-12.1/lib64/libcudart.so
Missing: /root/local/cuda-12.1/lib64/libcuda.so

The final error is: ModuleNotFoundError: No module named 'bitsandbytes'

Has anyone successfully installed bitsandbytes on SageMaker? Any suggestions on what I might be missing?

Thank you!
Full Code:

import subprocess
import os
import sys
from pathlib import Path
import re
import requests
import json  
def run_command(command, description=None):
    """Run a shell command and report whether it succeeded.

    Args:
        command: Command line handed to the shell as a single string.
        description: Optional label; when given it is printed (followed by
            "...") before the command starts.

    Returns:
        True when the command exits with status 0, in which case its captured
        stdout is printed. False when it exits non-zero, in which case the
        command and its captured stderr are printed instead.
    """
    if description:
        print(f"\n{description}...")

    try:
        completed = subprocess.run(
            command,
            shell=True,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as failure:
        print(f"Error executing command: {failure.cmd}")
        print(f"Error output: {failure.stderr}")
        return False

    print(completed.stdout)
    return True

def cleanup_existing_installations():
    """Uninstall the PyTorch stack and its pip-packaged CUDA libraries.

    Every package is removed with its own ``pip uninstall -y`` call through
    ``run_command``. The result of each call is ignored, so one failed
    removal does not stop the others. pip's cache is purged at the end.
    """
    torch_stack = (
        "torch",
        "torchvision",
        "torchaudio",
        "torchtext",
        "triton",
    )
    nvidia_wheels = (
        "nvidia-cublas-cu12",
        "nvidia-cuda-cupti-cu12",
        "nvidia-cuda-nvrtc-cu12",
        "nvidia-cuda-runtime-cu12",
        "nvidia-cudnn-cu12",
        "nvidia-cufft-cu12",
        "nvidia-curand-cu12",
        "nvidia-cusolver-cu12",
        "nvidia-cusparse-cu12",
        "nvidia-nccl-cu12",
        "nvidia-nvtx-cu12",
    )

    for name in torch_stack + nvidia_wheels:
        run_command(f"pip uninstall -y {name}", f"Removing {name}")

    # Drop cached wheels so a later install cannot reuse them.
    run_command("pip cache purge", "Clearing pip cache")
def get_cuda_version():
    """Detect the CUDA version available on this machine.

    Three sources are consulted in order, and the first one that yields a
    version wins:

    1. the "CUDA Version: X.Y" field printed by ``nvidia-smi``,
    2. the "release X.Y" field printed by ``nvcc --version``,
    3. a ``cuda-X.Y`` component in the ``CUDA_HOME`` environment variable.

    Each tool is probed independently, so a tool that is not installed (or
    cannot be executed) no longer prevents the remaining sources from being
    tried.

    Returns:
        Tuple ``(major, minor)`` of ints, or None when no source reveals a
        version.
    """
    probes = (
        (['nvidia-smi'], r'CUDA Version: (\d+)\.(\d+)'),
        (['nvcc', '--version'], r'release (\d+)\.(\d+)'),
    )

    for argv, pattern in probes:
        try:
            proc = subprocess.run(argv, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE, text=True)
        except (OSError, subprocess.SubprocessError) as e:
            # Typically FileNotFoundError: the tool is not on PATH.
            # Keep going so the other sources still get a chance.
            print(f"Error detecting CUDA version: {e}")
            continue

        if proc.returncode != 0:
            continue

        match = re.search(pattern, proc.stdout)
        if match:
            return (int(match.group(1)), int(match.group(2)))

    # As a last resort, read the version out of the CUDA_HOME path itself.
    cuda_home = os.environ.get('CUDA_HOME')
    if cuda_home:
        match = re.search(r'cuda-(\d+)\.(\d+)', cuda_home)
        if match:
            return (int(match.group(1)), int(match.group(2)))

    return None


def get_compatible_cuda_version(cuda_version):
    """Translate a detected CUDA version into the version code used for installs.

    Args:
        cuda_version: Tuple ``(major, minor)`` or None.

    Returns:
        None when ``cuda_version`` is None. '121' for every 12.x release.
        Otherwise the code of the newest entry among 11.8 / 11.7 / 11.6 that
        does not exceed the detected version, falling back to '116' (with a
        warning) when the detected version is older than all of them.
    """
    if cuda_version is None:
        print("Could not detect CUDA version.")
        return None

    major, minor = cuda_version

    # Every 12.x toolkit is served by the 12.1 packages.
    if major == 12:
        print(f"Detected CUDA {major}.{minor}, using CUDA 12.1 packages for compatibility")
        return '121'

    # Newest first, so the first floor the detected version reaches wins.
    supported_floors = (
        ((11, 8), '118'),
        ((11, 7), '117'),
        ((11, 6), '116'),
    )
    detected = (major, minor)
    for floor, code in supported_floors:
        if detected >= floor:
            return code

    print("Warning: Your CUDA version might be too old. Using oldest supported version.")
    return '116'



def get_compatible_torch_version(cuda_version_code):
    """Pick the PyTorch version to install for a CUDA version code.

    The candidate version comes from a fixed table. PyPI is then queried to
    confirm that the candidate is actually published. The query is a
    best-effort check: any failure (no network, timeout, unexpected payload)
    only produces a warning and the table value is used unconfirmed.

    Args:
        cuda_version_code: CUDA version code such as '121' or '118'.

    Returns:
        Tuple ``(torch_version, confirmed)``. ``confirmed`` is True only when
        the version was found in PyPI's release list. Unknown codes fall back
        to '2.1.0' with ``confirmed`` False.
    """
    # Conservative mapping of CUDA version codes to PyTorch versions.
    cuda_torch_map = {
        '124': ['2.1.0'],          # CUDA 12.4
        '123': ['2.1.0'],          # CUDA 12.3
        '122': ['2.1.0'],          # CUDA 12.2
        '121': ['2.1.0'],          # CUDA 12.1
        '120': ['2.1.0'],          # CUDA 12.0
        '118': ['2.0.1'],          # CUDA 11.8
        '117': ['2.0.1'],          # CUDA 11.7
        '116': ['1.13.1'],         # CUDA 11.6
        '115': ['1.13.1'],         # CUDA 11.5
        '113': ['1.13.1']          # CUDA 11.3
    }
    candidates = cuda_torch_map.get(cuda_version_code, [])

    try:
        # Without a timeout this call can block forever on a stalled
        # connection and hang the whole installer.
        response = requests.get("https://pypi.org/pypi/torch/json", timeout=10)
        if response.status_code == 200:
            available_versions = set(json.loads(response.text)['releases'].keys())
            compatible_versions = [v for v in candidates if v in available_versions]
            if compatible_versions:
                return compatible_versions[0], True
    except Exception as e:
        # Deliberately broad: the PyPI lookup must never abort installation.
        print(f"Warning: Could not fetch PyTorch versions from PyPI: {e}")

    # Fall back to the table value (or 2.1.0 for unknown codes), unconfirmed.
    default_version = cuda_torch_map.get(cuda_version_code, ['2.1.0'])[0]
    return default_version, False


def install_dependencies(cuda_version_code):
    """Install PyTorch for the given CUDA version code, then the pinned extras.

    Args:
        cuda_version_code: CUDA version code such as '121'; it selects both
            the ``+cuXXX`` torch build and the PyTorch wheel index.

    Returns:
        True when every pip invocation succeeds; False as soon as one fails
        (the remaining installs are skipped).
    """
    torch_version, _confirmed = get_compatible_torch_version(cuda_version_code)
    print(f"Installing PyTorch version {torch_version} for CUDA {cuda_version_code}")

    # torch is pinned to the CUDA-specific build; torchvision and torchaudio
    # are resolved by pip from the same index.
    torch_command = " ".join([
        "pip install --no-cache-dir",
        f"torch=={torch_version}+cu{cuda_version_code}",
        "torchvision",
        "torchaudio",
        f"--index-url https://download.pytorch.org/whl/cu{cuda_version_code}",
    ])
    if not run_command(torch_command, "Installing PyTorch ecosystem"):
        return False

    extras = (
        "transformers==4.44.2",
        "peft==0.12.0",
        "accelerate==0.33.0",
        "trl==0.9.6",
        "timm",
    )
    # all() short-circuits, so installation stops at the first failure.
    return all(
        run_command(f"pip install --no-cache-dir {requirement}", f"Installing {requirement}")
        for requirement in extras
    )

def verify_cuda_installation():
    """Print PyTorch's view of the CUDA setup in the current process.

    Returns:
        True when torch imports and the diagnostics print without error,
        regardless of whether CUDA is actually available. False when any
        exception is raised along the way (the error is printed).
    """
    try:
        import torch

        print(f"PyTorch version: {torch.__version__}")
        has_cuda = torch.cuda.is_available()
        print(f"CUDA available: {has_cuda}")
        if has_cuda:
            print(f"CUDA version: {torch.version.cuda}")
            print(f"cuDNN version: {torch.backends.cudnn.version()}")
            print(f"GPU device name: {torch.cuda.get_device_name(0)}")
    except Exception as e:
        print(f"Error verifying CUDA installation: {e}")
        return False
    return True
    
def verify_pytorch():
    """Check PyTorch in a fresh interpreter and print its CUDA status.

    A separate ``python -c`` process is used, so the check reflects what a
    newly started interpreter sees.

    Returns:
        The result of ``run_command``: True when the subprocess exits with
        status 0, False otherwise.
    """
    probe = (
        "import torch; "
        "print('PyTorch version:', torch.__version__); "
        "print('CUDA available:', torch.cuda.is_available()); "
        "print('CUDA version:', torch.version.cuda if torch.cuda.is_available() else 'N/A')"
    )
    return run_command(f'python -c "{probe}" ', "Verifying PyTorch installation")

def install_cuda(cuda_version_code):
    """Install a CUDA toolkit under the home directory via ``install_cuda.sh``.

    The toolkit is installed to ``~/local/cuda-<major>.<minor>``, where the
    major version is the first two characters of ``cuda_version_code`` and
    the minor version is the third.

    Args:
        cuda_version_code: Three-character CUDA version code such as '121'.

    Returns:
        False when the install script is missing or exits non-zero, True
        otherwise.
    """
    home_dir = str(Path.home())
    cuda_install_path = f"{home_dir}/local/cuda-{cuda_version_code[:2]}.{cuda_version_code[2]}"

    # The script is looked up at this fixed location, not in the working
    # directory, so the error messages report the exact path that was checked.
    cuda_install_script_path = "/opt/ml/model/code/install_cuda.sh"
    if not os.path.exists(cuda_install_script_path):
        print(f"Error: install_cuda.sh not found at {cuda_install_script_path}")
        print(f"Please ensure install_cuda.sh is present at {cuda_install_script_path}")
        return False

    # Script arguments: version code, install prefix, and a literal "1".
    # NOTE(review): the meaning of the trailing "1" is defined by
    # install_cuda.sh, which is not visible here - confirm against the script.
    return run_command(
        f"bash {cuda_install_script_path} {cuda_version_code} {cuda_install_path} 1",
        f"Installing CUDA {cuda_version_code}"
    )



def install_bitsandbytes(cuda_version_code):
    """Clone bitsandbytes and install it from source against a local CUDA toolkit.

    The CUDA toolkit is expected under ``~/local/cuda-<major>.<minor>``.
    ``CUDA_HOME``, ``LD_LIBRARY_PATH`` and ``BNB_CUDA_VERSION`` are exported
    into this process's environment so the pip subprocesses inherit them.

    The package is installed in regular (non-editable) mode. The clone in
    ``/tmp`` is always deleted afterwards, and an editable install would keep
    pointing at that deleted directory, leaving bitsandbytes unimportable
    ("No module named 'bitsandbytes'").

    Args:
        cuda_version_code: Three-character CUDA version code such as '121'.

    Returns:
        Tuple ``(success, global_paths)``. ``global_paths`` is the dict of the
        three environment values (keys ``CUDA_HOME``, ``LD_LIBRARY_PATH``,
        ``BNB_CUDA_VERSION``) and is returned on failure as well.
    """
    build_dir = "/tmp/bitsandbytes_build"

    # Start from a clean slate: no previous package, no stale clone.
    run_command("pip uninstall -y bitsandbytes", "Removing existing bitsandbytes")
    if os.path.exists(build_dir):
        run_command(f"rm -rf {build_dir}", "Cleaning up existing build directory")

    home_dir = str(Path.home())
    cuda_home = str(Path(home_dir) / "local" / f"cuda-{cuda_version_code[:2]}.{cuda_version_code[2]}")
    cuda_lib = str(Path(cuda_home) / "lib64")
    # Join only non-empty parts: a trailing ':' would add an empty entry,
    # which the dynamic loader treats as the current directory.
    ld_library_path = ":".join(
        part for part in (cuda_lib, os.environ.get('LD_LIBRARY_PATH', '')) if part
    )
    global_paths = {
        "CUDA_HOME": cuda_home,
        "LD_LIBRARY_PATH": ld_library_path,
        "BNB_CUDA_VERSION": cuda_version_code
    }

    if not run_command(
        f"git clone https://github.com/timdettmers/bitsandbytes.git {build_dir}",
        "Cloning bitsandbytes repository"
    ):
        return False, global_paths

    original_dir = os.getcwd()

    try:
        os.chdir(build_dir)

        # Export the CUDA settings so the pip subprocesses see them.
        os.environ.update(global_paths)
        print("Environment variables set:")
        print(f"CUDA_HOME: {os.environ['CUDA_HOME']}")
        print(f"LD_LIBRARY_PATH: {os.environ['LD_LIBRARY_PATH']}")
        print(f"BNB_CUDA_VERSION: {os.environ['BNB_CUDA_VERSION']}")

        # Non-editable install: the files are copied into site-packages and
        # survive the removal of build_dir in the finally block below.
        if not run_command("pip install .", "Installing bitsandbytes"):
            return False, global_paths

        # Also install under the conda prefix.
        if not run_command(
            "pip install . --prefix=/opt/conda",
            "Installing bitsandbytes to conda environment"
        ):
            return False, global_paths

        verify_cuda_setup()

        return True, global_paths

    except Exception as e:
        print(f"Error during bitsandbytes installation: {str(e)}")
        return False, global_paths
    finally:
        # Always restore the working directory and remove the clone.
        os.chdir(original_dir)
        run_command(f"rm -rf {build_dir}", "Cleaning up build directory")

def verify_cuda_setup():
    """Print CUDA-related environment values and whether key libraries exist.

    ``libcudart.so`` and ``libcuda.so`` are looked up in ``$CUDA_HOME/lib64``
    (``/usr/local/cuda/lib64`` when ``CUDA_HOME`` is unset). Each is reported
    as "Found" or "Missing". Nothing is returned and nothing is modified.
    """
    env = os.environ
    lib_dir = Path(env.get('CUDA_HOME', '/usr/local/cuda')) / "lib64"

    print("\nCUDA Setup Verification:")
    print(f"CUDA_HOME: {env.get('CUDA_HOME', 'Not set')}")
    print(f"LD_LIBRARY_PATH: {env.get('LD_LIBRARY_PATH', 'Not set')}")

    for lib_name in ("libcudart.so", "libcuda.so"):
        candidate = str(lib_dir / lib_name)
        status = "Found" if os.path.exists(candidate) else "Missing"
        print(f"{status}: {candidate}")

def try_pip_install_bitsandbytes(cuda_version_code):
    """Install bitsandbytes from a published wheel as a fallback.

    The fallback is only attempted for CUDA 12 and newer.

    The requirement specifier is quoted because the command runs through a
    shell: an unquoted ``bitsandbytes>=0.41.1`` is parsed as the package
    ``bitsandbytes`` with stdout redirected to a file named ``=0.41.1``,
    silently dropping the version constraint. The package is installed into
    the active environment so that it is importable afterwards.

    Args:
        cuda_version_code: CUDA version code such as '121'; its first two
            characters are read as the major version.

    Returns:
        True when pip succeeds; False when pip fails or the CUDA major
        version is below 12.
    """
    major_version = int(cuda_version_code[:2])
    if major_version < 12:
        return False

    return run_command(
        'pip install --no-cache-dir "bitsandbytes>=0.41.1"',
        "Installing bitsandbytes via pip"
    )


def verify_installation():
    """Import bitsandbytes in a fresh interpreter and print where it lives.

    The subprocess prints ``sys.path``, the bitsandbytes version and the
    module's file location.

    Returns:
        The result of ``run_command``: True when the import and the prints
        succeed in the subprocess, False otherwise.
    """
    statements = [
        "import sys",
        "print('Python path:', sys.path)",
        "import bitsandbytes",
        "print('BitsAndBytes version:', bitsandbytes.__version__)",
        "print('BitsAndBytes path:', bitsandbytes.__file__)",
    ]
    verify_cmd = 'python -c "' + "; ".join(statements) + '"'
    return run_command(verify_cmd, "Verifying bitsandbytes installation")



def save_environment_config(cuda_home=None, ld_library_path=None, bnb_cuda_version=None):
    """Write the CUDA-related environment settings to a JSON file.

    Any argument left as None is taken from the current environment, and from
    a built-in default when the environment variable is not set either.
    The file is written to ``/opt/ml/model/code/env_config.json``.

    Args:
        cuda_home (str): Path to the CUDA installation (``CUDA_HOME``).
        ld_library_path (str): Library search path (``LD_LIBRARY_PATH``).
        bnb_cuda_version (str): CUDA version code for bitsandbytes
            (``BNB_CUDA_VERSION``).
    """
    import json

    env = os.environ

    if cuda_home is None:
        cuda_home = env.get('CUDA_HOME', '/root/local/cuda-12.1')

    # The default library path is derived from the resolved cuda_home, so
    # cuda_home has to be settled first.
    if ld_library_path is None:
        ld_library_path = env.get('LD_LIBRARY_PATH', f"{cuda_home}/lib64:/usr/local/cuda/lib64")

    if bnb_cuda_version is None:
        bnb_cuda_version = env.get('BNB_CUDA_VERSION', '121')

    config = dict(
        CUDA_HOME=cuda_home,
        LD_LIBRARY_PATH=ld_library_path,
        BNB_CUDA_VERSION=bnb_cuda_version,
    )

    config_path = "/opt/ml/model/code/env_config.json"
    with open(config_path, "w") as handle:
        handle.write(json.dumps(config))
    print(f"Saved environment config to {config_path}: {config}")


    
def setup_install_bnb():
    """Run the whole installation: detect CUDA, install, verify, save config.

    Steps, in order: install the Python dependencies, install the CUDA
    toolkit, install bitsandbytes from source (falling back to a pip wheel
    when the source install fails), and verify that bitsandbytes imports.
    When the source install succeeded, the environment values it used are
    saved with ``save_environment_config``. Finally the bitsandbytes native
    library is located with ``verify_bitsandbytes_installation``.

    The process exits with status 1 when the CUDA version cannot be
    determined or when any step fails (for bitsandbytes: when both the source
    install and the pip fallback fail).
    """
    print("Starting bitsandbytes installation process...")

    cuda_version = get_cuda_version()
    if cuda_version:
        print(f"Detected CUDA version: {cuda_version[0]}.{cuda_version[1]}")
    else:
        print("Could not detect CUDA version")
        sys.exit(1)

    cuda_version_code = get_compatible_cuda_version(cuda_version)
    if not cuda_version_code:
        sys.exit(1)

    print(f"Using CUDA version code {cuda_version_code} for installation")

    steps = [
        (lambda: install_dependencies(cuda_version_code), "Installing dependencies"),
        (lambda: install_cuda(cuda_version_code), "Installing CUDA"),
        (lambda: install_bitsandbytes(cuda_version_code), "Installing bitsandbytes"),
        (verify_installation, "Verifying installation")
    ]

    # Stays None unless the source install succeeds; it must be bound before
    # the loop because it is read after it on every path.
    installation_paths = None

    for step_func, description in steps:
        print(f"\n=== {description} ===")
        result = step_func()

        if description == "Installing bitsandbytes":
            # This step returns (success, paths) instead of a plain bool.
            success, paths = result
            if success:
                installation_paths = paths
                continue

            print("\nAttempting fallback pip installation...")
            if try_pip_install_bitsandbytes(cuda_version_code):
                print("Fallback installation successful!")
                continue

            # Both the source install and the fallback failed.
            result = False

        if not result:
            print(f"\nError during {description}. Installation failed.")
            sys.exit(1)

    if installation_paths:
        # The dict uses environment-variable names as keys, which differ from
        # save_environment_config's parameter names, so map them explicitly.
        save_environment_config(
            cuda_home=installation_paths["CUDA_HOME"],
            ld_library_path=installation_paths["LD_LIBRARY_PATH"],
            bnb_cuda_version=installation_paths["BNB_CUDA_VERSION"],
        )
    else:
        print("Warning: No installation paths captured, using default values")

    verify_bitsandbytes_installation()
    print("\nInstallation completed successfully!")
    
def verify_bitsandbytes_installation(search_paths=None, file_name=None):
    """Locate the bitsandbytes CUDA library and add its directory to LD_LIBRARY_PATH.

    Args:
        search_paths: Directories to search recursively. Defaults to
            ``/usr/local/lib``, ``/usr/lib`` and ``/opt/conda/lib``.
        file_name: Library file name to look for. Defaults to
            ``libbitsandbytes_cuda<code>.so``, where ``<code>`` is the
            ``BNB_CUDA_VERSION`` environment variable, or '121' when it is
            unset.

    Returns:
        None. On success the directory containing the first match is appended
        to ``LD_LIBRARY_PATH`` in this process's environment.

    Raises:
        SystemExit: With status 1 when the library is not found in any of the
            search paths.
    """
    if file_name is None:
        # Follow the version the installer selected instead of assuming 12.1.
        version_code = os.environ.get("BNB_CUDA_VERSION", "121")
        file_name = f"libbitsandbytes_cuda{version_code}.so"
    if search_paths is None:
        search_paths = ["/usr/local/lib", "/usr/lib", "/opt/conda/lib"]

    for base_path in search_paths:
        base_path = Path(base_path)
        if not base_path.exists():
            continue

        # Only the first match in the directory tree is needed.
        first_match = next(base_path.rglob(file_name), None)
        if first_match is None:
            continue

        bnb_path = first_match.parent
        print(f"Found {file_name} in: {bnb_path}")

        # Avoid a leading ':' when the variable is unset or empty: an empty
        # entry makes the loader search the current directory.
        current = os.environ.get("LD_LIBRARY_PATH", "")
        os.environ["LD_LIBRARY_PATH"] = f"{current}:{bnb_path}" if current else str(bnb_path)
        print(f"Updated LD_LIBRARY_PATH: {os.environ['LD_LIBRARY_PATH']}")
        return

    print(f"Error: {file_name} not found in any of the search paths. Ensure bitsandbytes is correctly installed.")
    sys.exit(1)
# Run the full installation workflow only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    setup_install_bnb()

@mahdavis
Copy link
Author

Solved the issue by adding the following lines to my Python script before starting the bitsandbytes installation:
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64'
os.environ['CUDA_HOME'] = '/usr/local/cuda'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant