diff --git a/Cargo.lock b/Cargo.lock index 5280666..107cbe1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -599,6 +599,8 @@ dependencies = [ "qp-poseidon-core", "rand 0.9.2", "rand_chacha 0.9.0", + "regex", + "tokio", "wgpu", ] diff --git a/crates/engine-gpu/Cargo.toml b/crates/engine-gpu/Cargo.toml index 2996cb8..8132411 100644 --- a/crates/engine-gpu/Cargo.toml +++ b/crates/engine-gpu/Cargo.toml @@ -27,17 +27,20 @@ qp-poseidon-constants = "1.1" qp-plonky2-field = { version = "1.1.1" } plonky2 = { package = "qp-plonky2", version = "1.1.3" } -wgpu = { version = "27.0.1" } # GPU compute library -futures = "0.3" # For async executor -bytemuck = "1.16" # For buffer mapping +wgpu = { version = "27.0.1" } # GPU compute library +tokio = { workspace = true, features = ["rt", "rt-multi-thread", "time"] } # Async runtime with timeout support +futures = "0.3" # For async oneshot channels in test modules +bytemuck = "1.16" # For buffer mapping rand = { workspace = true, features = ["std", "std_rng"] } -rand_chacha = "0.9" # For deterministic random test generation +rand_chacha = "0.9" # For deterministic random test generation +regex = "1" # For GPU name pattern matching [dev-dependencies] hex = { workspace = true } criterion = "0.5" rand = { workspace = true, features = ["std", "std_rng"] } env_logger.workspace = true +tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread", "time"] } # Add macros for #[tokio::main] [[bench]] name = "gpu_engine_bench" diff --git a/crates/engine-gpu/src/gpu_tiers.rs b/crates/engine-gpu/src/gpu_tiers.rs new file mode 100644 index 0000000..3089d73 --- /dev/null +++ b/crates/engine-gpu/src/gpu_tiers.rs @@ -0,0 +1,723 @@ +//! GPU tier detection using regex patterns +//! +//! This module provides a table-driven approach to GPU detection, +//! using regex patterns with word boundaries to avoid substring matching issues. + +use regex::Regex; +use std::sync::LazyLock; + +/// GPU tier configuration +struct GpuTier { + /// Regex pattern to match GPU name (case-insensitive) + pattern: &'static str, + /// Human-readable tier name + name: &'static str, + /// Divisor for max_workgroups (higher = more conservative) + workgroup_divisor: u32, + /// Minimum workgroups to use + min_workgroups: u32, +} + +/// Compiled GPU tier with regex +struct CompiledGpuTier { + regex: Regex, + name: &'static str, + workgroup_divisor: u32, + min_workgroups: u32, +} + +impl CompiledGpuTier { + fn from_tier(tier: &GpuTier) -> Self { + Self { + regex: Regex::new(&format!("(?i){}", tier.pattern)).expect("Invalid GPU tier regex"), + name: tier.name, + workgroup_divisor: tier.workgroup_divisor, + min_workgroups: tier.min_workgroups, + } + } +} + +// GPU tiers are checked in order - first match wins +// Use word boundaries (\b) to avoid substring issues like "550" matching "5500" + +const NVIDIA_TIERS: &[GpuTier] = &[ + // Blackwell (RTX 50 series) + GpuTier { + pattern: r"\b50[89]0\b", + name: "NVIDIA RTX 50 Flagship (Blackwell)", + workgroup_divisor: 6, + min_workgroups: 5120, + }, + GpuTier { + pattern: r"\b50[67]0\b|rtx 50", + name: "NVIDIA RTX 50 (Blackwell)", + workgroup_divisor: 7, + min_workgroups: 4608, + }, + // Ada Lovelace (RTX 40 series) + GpuTier { + pattern: r"\b40[89]0\b", + name: "NVIDIA RTX 40 Flagship (Ada)", + workgroup_divisor: 8, + min_workgroups: 4096, + }, + GpuTier { + pattern: r"\b40[67]0\b|rtx 40", + name: "NVIDIA RTX 40 (Ada)", + workgroup_divisor: 10, + min_workgroups: 3072, + }, + // Ampere/Turing (RTX 30/20 series) + GpuTier { + pattern: r"\b30[5-9]0\b|\b20[6-8]0\b|rtx 30|rtx 20", + name: "NVIDIA RTX 30/20 (Ampere/Turing)", + workgroup_divisor: 12, + min_workgroups: 2048, + }, + // Turing/Pascal (GTX 16/10 series) + GpuTier { + pattern: r"\b16[56]0\b|\b10[3-8]0\b|gtx 16|gtx 10", + name: "NVIDIA GTX 16/10 (Turing/Pascal)", + workgroup_divisor: 16, + min_workgroups: 1024, + }, + // Maxwell (GTX 900 series) + GpuTier { + pattern: r"\b9[5-8]0\b|gtx 9", + name: "NVIDIA GTX 900 (Maxwell)", + workgroup_divisor: 18, + min_workgroups: 768, + }, + // Kepler/Maxwell (GTX 700 series) + GpuTier { + pattern: r"\b7[5-8]0\b|gtx 7", + name: "NVIDIA GTX 700 (Kepler/Maxwell)", + workgroup_divisor: 20, + min_workgroups: 512, + }, + // Legacy GTX + GpuTier { + pattern: r"gtx [456]", + name: "NVIDIA GTX Legacy (Fermi/Kepler)", + workgroup_divisor: 24, + min_workgroups: 384, + }, + GpuTier { + pattern: r"\bgtx\b", + name: "NVIDIA GTX (Unknown)", + workgroup_divisor: 20, + min_workgroups: 512, + }, + // Mobile + GpuTier { + pattern: r"\bmx[1-5]\d0|geforce mx", + name: "NVIDIA MX (Mobile)", + workgroup_divisor: 24, + min_workgroups: 384, + }, + GpuTier { + pattern: r"geforce gt\b|\bgt \d{3}\b", + name: "NVIDIA GT (Entry-Level)", + workgroup_divisor: 28, + min_workgroups: 256, + }, + // Professional + GpuTier { + pattern: r"quadro|rtx a\d|tesla|\ba100\b|\bh100\b|\bl4\b", + name: "NVIDIA Quadro/Professional", + workgroup_divisor: 10, + min_workgroups: 2560, + }, +]; + +const AMD_TIERS: &[GpuTier] = &[ + // RDNA 4 (RX 9000 series) + GpuTier { + pattern: r"rx 9|\b90[78]0\b", + name: "AMD RX 9000 (RDNA 4)", + workgroup_divisor: 8, + min_workgroups: 4096, + }, + // RDNA 3 Discrete + GpuTier { + pattern: r"\b7900\b", + name: "AMD RX 7900 (RDNA 3 Flagship)", + workgroup_divisor: 9, + min_workgroups: 3584, + }, + GpuTier { + pattern: r"rx 7|\b7[6-8]00\b", + name: "AMD RX 7000 (RDNA 3)", + workgroup_divisor: 10, + min_workgroups: 3072, + }, + // RDNA 3 APUs - check before discrete to avoid substring match + GpuTier { + pattern: r"\b780m\b|radeon 780m", + name: "AMD Radeon 780M (RDNA 3 APU)", + workgroup_divisor: 12, + min_workgroups: 2048, + }, + GpuTier { + pattern: r"\b7[46]0m\b|radeon 7[46]0m", + name: "AMD Radeon 7x0M (RDNA 3 APU)", + workgroup_divisor: 16, + min_workgroups: 1024, + }, + // RDNA 2 Discrete + GpuTier { + pattern: r"\b6[89][05]0\b", + name: "AMD RX 6900/6800 (RDNA 2 Flagship)", + workgroup_divisor: 12, + min_workgroups: 2560, + }, + GpuTier { + pattern: r"\b6[67][05]0\b", + name: "AMD RX 6700/6600 (RDNA 2)", + workgroup_divisor: 14, + min_workgroups: 2048, + }, + GpuTier { + pattern: r"\b6[45]00\b", + name: "AMD RX 6500/6400 (RDNA 2 Entry)", + workgroup_divisor: 22, + min_workgroups: 512, + }, + GpuTier { + pattern: r"rx 6\d{3}", + name: "AMD RX 6000 (RDNA 2)", + workgroup_divisor: 14, + min_workgroups: 2048, + }, + // RDNA 2 APUs - check before discrete + GpuTier { + pattern: r"\b680m\b|radeon 680m", + name: "AMD Radeon 680M (RDNA 2 APU)", + workgroup_divisor: 16, + min_workgroups: 1536, + }, + GpuTier { + pattern: r"\b6[16]0m\b|radeon 6[16]0m", + name: "AMD Radeon 6x0M (RDNA 2 APU)", + workgroup_divisor: 22, + min_workgroups: 768, + }, + // RDNA 1 (RX 5000 series) - 4-digit patterns to avoid matching Polaris 3-digit + GpuTier { + pattern: r"\b5700\b", + name: "AMD RX 5700 (RDNA 1)", + workgroup_divisor: 16, + min_workgroups: 1536, + }, + GpuTier { + pattern: r"\b5[56]00\b|rx 5\d{3}", + name: "AMD RX 5000 (RDNA 1)", + workgroup_divisor: 18, + min_workgroups: 1024, + }, + // Polaris (RX 400/500 series) - 3-digit models with boundaries + GpuTier { + pattern: r"rx [45]\d0\b|\b[45][6-9]0\b|\b590\b|rx 5.0x", + name: "AMD RX 500/400 (Polaris)", + workgroup_divisor: 20, + min_workgroups: 768, + }, + // Vega + GpuTier { + pattern: r"radeon vii\b", + name: "AMD Radeon VII (Vega 20)", + workgroup_divisor: 12, + min_workgroups: 2048, + }, + GpuTier { + pattern: r"vega\s*64", + name: "AMD Vega 64 (Discrete)", + workgroup_divisor: 14, + min_workgroups: 1536, + }, + GpuTier { + pattern: r"vega\s*56", + name: "AMD Vega 56 (Discrete)", + workgroup_divisor: 16, + min_workgroups: 1280, + }, + GpuTier { + pattern: r"vega", + name: "AMD Vega (APU)", + workgroup_divisor: 28, + min_workgroups: 384, + }, + // GCN + GpuTier { + pattern: r"fury|nano", + name: "AMD R9 Fury/Nano (Fiji)", + workgroup_divisor: 16, + min_workgroups: 1280, + }, + GpuTier { + pattern: r"r9.*(3[89]0|2[89]0)|\b[23][89]0x?\b", + name: "AMD R9 (GCN)", + workgroup_divisor: 20, + min_workgroups: 768, + }, + GpuTier { + pattern: r"r7.*(3[67]0|2[67]0)|\b[23][67]0x?\b", + name: "AMD R7 (GCN)", + workgroup_divisor: 22, + min_workgroups: 512, + }, + // Professional + GpuTier { + pattern: r"radeon pro|instinct mi|mi[123]\d0|firepro|w[567]\d{3}", + name: "AMD Radeon Pro/Instinct", + workgroup_divisor: 10, + min_workgroups: 2560, + }, + // OEM/APU fallbacks + GpuTier { + pattern: r"radeon\s*\(tm\)\s*[67]\d{2}\b|radeon [67]\d{2}\b", + name: "AMD Radeon OEM (Polaris Rebrand)", + workgroup_divisor: 24, + min_workgroups: 512, + }, + GpuTier { + pattern: r"radeon\s*(\(tm\)\s*)?graphics", + name: "AMD Radeon Graphics (APU)", + workgroup_divisor: 26, + min_workgroups: 384, + }, +]; + +const INTEL_TIERS: &[GpuTier] = &[ + // Battlemage (Arc B-Series) + GpuTier { + pattern: r"arc b|\bb5[78]0\b", + name: "Intel Arc B-Series (Battlemage)", + workgroup_divisor: 10, + min_workgroups: 2560, + }, + // Alchemist Mobile - check BEFORE desktop (a770m contains a770) + GpuTier { + pattern: r"\ba7[37]0m\b", + name: "Intel Arc A7 Mobile (Alchemist)", + workgroup_divisor: 14, + min_workgroups: 1536, + }, + GpuTier { + pattern: r"\ba5[57]0m\b", + name: "Intel Arc A5 Mobile (Alchemist)", + workgroup_divisor: 16, + min_workgroups: 1024, + }, + GpuTier { + pattern: r"\ba3[57]0m\b", + name: "Intel Arc A3 Mobile (Alchemist)", + workgroup_divisor: 20, + min_workgroups: 512, + }, + // Alchemist Desktop + GpuTier { + pattern: r"\ba7[57]0\b", + name: "Intel Arc A7 (Alchemist)", + workgroup_divisor: 12, + min_workgroups: 2048, + }, + GpuTier { + pattern: r"\ba580\b", + name: "Intel Arc A5 (Alchemist)", + workgroup_divisor: 14, + min_workgroups: 1536, + }, + GpuTier { + pattern: r"\ba3[18]0\b", + name: "Intel Arc A3 (Alchemist)", + workgroup_divisor: 18, + min_workgroups: 768, + }, + GpuTier { + pattern: r"arc a|\barc\b", + name: "Intel Arc (Unknown)", + workgroup_divisor: 16, + min_workgroups: 1024, + }, + // Integrated + GpuTier { + pattern: r"iris xe max", + name: "Intel Iris Xe Max (Discrete)", + workgroup_divisor: 20, + min_workgroups: 512, + }, + GpuTier { + pattern: r"iris xe", + name: "Intel Iris Xe (Integrated)", + workgroup_divisor: 24, + min_workgroups: 384, + }, + GpuTier { + pattern: r"iris pro", + name: "Intel Iris Pro (Integrated)", + workgroup_divisor: 26, + min_workgroups: 256, + }, + GpuTier { + pattern: r"iris plus|iris\b", + name: "Intel Iris Plus (Integrated)", + workgroup_divisor: 26, + min_workgroups: 320, + }, + GpuTier { + pattern: r"uhd.*(7\d{2}|graphics 7)", + name: "Intel UHD 700 (Integrated)", + workgroup_divisor: 26, + min_workgroups: 320, + }, + GpuTier { + pattern: r"uhd.*(6\d{2}|graphics 6)", + name: "Intel UHD 600 (Integrated)", + workgroup_divisor: 28, + min_workgroups: 256, + }, + GpuTier { + pattern: r"uhd", + name: "Intel UHD Graphics (Integrated)", + workgroup_divisor: 28, + min_workgroups: 256, + }, + GpuTier { + pattern: r"hd graphics|hd [456]\d{2}", + name: "Intel HD Graphics (Integrated)", + workgroup_divisor: 30, + min_workgroups: 192, + }, +]; + +const QUALCOMM_TIERS: &[GpuTier] = &[ + // Snapdragon X (Adreno X1) + GpuTier { + pattern: r"x elite|x plus|adreno x|x1-[89]", + name: "Qualcomm Adreno X1 (Snapdragon X)", + workgroup_divisor: 14, + min_workgroups: 1536, + }, + // Adreno 700 series (730, 740, 750, etc.) + GpuTier { + pattern: r"adreno\s*7[0-9]{2}\b|\b7[345]0\b", + name: "Qualcomm Adreno 700 Series", + workgroup_divisor: 16, + min_workgroups: 1024, + }, + // Adreno 600 series (610, 612, 615, 618, 619, 620, 630, 640, 650, 660, etc.) + GpuTier { + pattern: r"adreno\s*6[0-9]{2}\b|\b6[1-6][0-9]\b", + name: "Qualcomm Adreno 600 Series", + workgroup_divisor: 20, + min_workgroups: 512, + }, + // Adreno 500 series (505, 506, 508, 509, 510, 512, 530, 540) + GpuTier { + pattern: r"adreno\s*5[0-4][0-9]\b|\b5[0-4][0-9]\b", + name: "Qualcomm Adreno 500 Series", + workgroup_divisor: 24, + min_workgroups: 384, + }, +]; + +const APPLE_TIERS: &[GpuTier] = &[ + // M4 series + GpuTier { + pattern: r"m4 ultra", + name: "Apple M4 Ultra", + workgroup_divisor: 4, + min_workgroups: 1600, + }, + GpuTier { + pattern: r"m4 max", + name: "Apple M4 Max", + workgroup_divisor: 4, + min_workgroups: 800, + }, + GpuTier { + pattern: r"m4 pro", + name: "Apple M4 Pro", + workgroup_divisor: 4, + min_workgroups: 400, + }, + GpuTier { + pattern: r"\bm4\b", + name: "Apple M4", + workgroup_divisor: 4, + min_workgroups: 200, + }, + // M3 series + GpuTier { + pattern: r"m3 ultra", + name: "Apple M3 Ultra", + workgroup_divisor: 4, + min_workgroups: 1520, + }, + GpuTier { + pattern: r"m3 max", + name: "Apple M3 Max", + workgroup_divisor: 4, + min_workgroups: 800, + }, + GpuTier { + pattern: r"m3 pro", + name: "Apple M3 Pro", + workgroup_divisor: 4, + min_workgroups: 360, + }, + GpuTier { + pattern: r"\bm3\b", + name: "Apple M3", + workgroup_divisor: 4, + min_workgroups: 200, + }, + // M2 series + GpuTier { + pattern: r"m2 ultra", + name: "Apple M2 Ultra", + workgroup_divisor: 4, + min_workgroups: 1520, + }, + GpuTier { + pattern: r"m2 max", + name: "Apple M2 Max", + workgroup_divisor: 4, + min_workgroups: 760, + }, + GpuTier { + pattern: r"m2 pro", + name: "Apple M2 Pro", + workgroup_divisor: 4, + min_workgroups: 380, + }, + GpuTier { + pattern: r"\bm2\b", + name: "Apple M2", + workgroup_divisor: 4, + min_workgroups: 200, + }, + // M1 series + GpuTier { + pattern: r"m1 ultra", + name: "Apple M1 Ultra", + workgroup_divisor: 4, + min_workgroups: 1280, + }, + GpuTier { + pattern: r"m1 max", + name: "Apple M1 Max", + workgroup_divisor: 4, + min_workgroups: 640, + }, + GpuTier { + pattern: r"m1 pro", + name: "Apple M1 Pro", + workgroup_divisor: 4, + min_workgroups: 320, + }, + GpuTier { + pattern: r"\bm1\b", + name: "Apple M1", + workgroup_divisor: 4, + min_workgroups: 160, + }, +]; + +/// Compiled GPU tier tables (lazily initialized) +struct GpuTierTables { + nvidia: Vec, + amd: Vec, + intel: Vec, + qualcomm: Vec, + apple: Vec, +} + +static GPU_TIERS: LazyLock = LazyLock::new(|| GpuTierTables { + nvidia: NVIDIA_TIERS + .iter() + .map(CompiledGpuTier::from_tier) + .collect(), + amd: AMD_TIERS.iter().map(CompiledGpuTier::from_tier).collect(), + intel: INTEL_TIERS.iter().map(CompiledGpuTier::from_tier).collect(), + qualcomm: QUALCOMM_TIERS + .iter() + .map(CompiledGpuTier::from_tier) + .collect(), + apple: APPLE_TIERS.iter().map(CompiledGpuTier::from_tier).collect(), +}); + +/// Result of GPU tier matching +pub struct GpuTierMatch { + pub name: &'static str, + pub workgroup_divisor: u32, + pub min_workgroups: u32, + pub is_fallback: bool, +} + +/// Find matching tier from a list of compiled tiers +fn find_matching_tier(name: &str, tiers: &[CompiledGpuTier]) -> Option { + tiers + .iter() + .find(|tier| tier.regex.is_match(name)) + .map(|tier| GpuTierMatch { + name: tier.name, + workgroup_divisor: tier.workgroup_divisor, + min_workgroups: tier.min_workgroups, + is_fallback: false, + }) +} + +/// Detect GPU tier based on adapter info +/// +/// Returns (tier_name, workgroup_divisor, min_workgroups, is_fallback) +pub fn detect_gpu_tier(vendor_name: &str, vendor_id: u32, is_metal_backend: bool) -> GpuTierMatch { + let name_lower = vendor_name.to_lowercase(); + + // Determine vendor and find matching tier + if name_lower.contains("nvidia") || vendor_id == 0x10DE { + find_matching_tier(&name_lower, &GPU_TIERS.nvidia).unwrap_or(GpuTierMatch { + name: "NVIDIA Unknown", + workgroup_divisor: 20, + min_workgroups: 512, + is_fallback: true, + }) + } else if name_lower.contains("amd") || name_lower.contains("radeon") || vendor_id == 0x1002 { + find_matching_tier(&name_lower, &GPU_TIERS.amd).unwrap_or(GpuTierMatch { + name: "AMD Unknown", + workgroup_divisor: 24, + min_workgroups: 512, + is_fallback: true, + }) + } else if name_lower.contains("intel") || vendor_id == 0x8086 { + find_matching_tier(&name_lower, &GPU_TIERS.intel).unwrap_or(GpuTierMatch { + name: "Intel Unknown", + workgroup_divisor: 24, + min_workgroups: 256, + is_fallback: true, + }) + } else if name_lower.contains("qualcomm") + || name_lower.contains("adreno") + || vendor_id == 0x5143 + { + find_matching_tier(&name_lower, &GPU_TIERS.qualcomm).unwrap_or(GpuTierMatch { + name: "Qualcomm Adreno (Unknown)", + workgroup_divisor: 24, + min_workgroups: 384, + is_fallback: true, + }) + } else if is_metal_backend { + // Apple Silicon - detected by Metal backend + find_matching_tier(&name_lower, &GPU_TIERS.apple).unwrap_or(GpuTierMatch { + name: "Apple Silicon Unknown", + workgroup_divisor: 4, + min_workgroups: 160, + is_fallback: true, + }) + } else { + GpuTierMatch { + name: "Unknown GPU", + workgroup_divisor: 16, + min_workgroups: 512, + is_fallback: true, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_nvidia_detection() { + // RTX 40 series + let tier = detect_gpu_tier("NVIDIA GeForce RTX 4090", 0x10DE, false); + assert_eq!(tier.name, "NVIDIA RTX 40 Flagship (Ada)"); + assert!(!tier.is_fallback); + + let tier = detect_gpu_tier("NVIDIA GeForce RTX 4070 Ti", 0x10DE, false); + assert_eq!(tier.name, "NVIDIA RTX 40 (Ada)"); + + // RTX 30 series + let tier = detect_gpu_tier("NVIDIA GeForce RTX 3080", 0x10DE, false); + assert_eq!(tier.name, "NVIDIA RTX 30/20 (Ampere/Turing)"); + + // GTX 10 series + let tier = detect_gpu_tier("NVIDIA GeForce GTX 1080 Ti", 0x10DE, false); + assert_eq!(tier.name, "NVIDIA GTX 16/10 (Turing/Pascal)"); + } + + #[test] + fn test_amd_rdna_vs_polaris() { + // RDNA 1 - should NOT match Polaris + let tier = detect_gpu_tier("AMD Radeon RX 5500 XT", 0x1002, false); + assert_eq!(tier.name, "AMD RX 5000 (RDNA 1)"); + assert!(!tier.is_fallback); + + let tier = detect_gpu_tier("AMD Radeon RX 5600 XT", 0x1002, false); + assert_eq!(tier.name, "AMD RX 5000 (RDNA 1)"); + + let tier = detect_gpu_tier("AMD Radeon RX 5700 XT", 0x1002, false); + assert_eq!(tier.name, "AMD RX 5700 (RDNA 1)"); + + // Polaris - should match Polaris + let tier = detect_gpu_tier("AMD Radeon RX 580", 0x1002, false); + assert_eq!(tier.name, "AMD RX 500/400 (Polaris)"); + + let tier = detect_gpu_tier("AMD Radeon RX 560X", 0x1002, false); + assert_eq!(tier.name, "AMD RX 500/400 (Polaris)"); + + let tier = detect_gpu_tier("AMD Radeon RX 550", 0x1002, false); + assert_eq!(tier.name, "AMD RX 500/400 (Polaris)"); + } + + #[test] + fn test_amd_vega_apu() { + let tier = detect_gpu_tier("AMD Radeon(TM) Vega 8 Graphics", 0x1002, false); + assert_eq!(tier.name, "AMD Vega (APU)"); + assert!(!tier.is_fallback); + } + + #[test] + fn test_intel_arc_mobile_vs_desktop() { + // Mobile should match mobile tier + let tier = detect_gpu_tier("Intel Arc A770M", 0x8086, false); + assert_eq!(tier.name, "Intel Arc A7 Mobile (Alchemist)"); + assert!(!tier.is_fallback); + + // Desktop should match desktop tier + let tier = detect_gpu_tier("Intel Arc A770", 0x8086, false); + assert_eq!(tier.name, "Intel Arc A7 (Alchemist)"); + assert!(!tier.is_fallback); + } + + #[test] + fn test_apple_silicon() { + let tier = detect_gpu_tier("Apple M1 Pro", 0, true); + assert_eq!(tier.name, "Apple M1 Pro"); + assert!(!tier.is_fallback); + + let tier = detect_gpu_tier("Apple M3 Max", 0, true); + assert_eq!(tier.name, "Apple M3 Max"); + assert!(!tier.is_fallback); + } + + #[test] + fn test_qualcomm_adreno_series() { + // Adreno 627 should match 600 series, NOT 700 series + let tier = detect_gpu_tier("Qualcomm Adreno 627", 0x5143, false); + assert_eq!(tier.name, "Qualcomm Adreno 600 Series"); + assert!(!tier.is_fallback); + + // Adreno 730 should match 700 series + let tier = detect_gpu_tier("Qualcomm Adreno 730", 0x5143, false); + assert_eq!(tier.name, "Qualcomm Adreno 700 Series"); + assert!(!tier.is_fallback); + + // Adreno 540 should match 500 series + let tier = detect_gpu_tier("Qualcomm Adreno 540", 0x5143, false); + assert_eq!(tier.name, "Qualcomm Adreno 500 Series"); + assert!(!tier.is_fallback); + } +} diff --git a/crates/engine-gpu/src/lib.rs b/crates/engine-gpu/src/lib.rs index 8a525a0..db6936b 100644 --- a/crates/engine-gpu/src/lib.rs +++ b/crates/engine-gpu/src/lib.rs @@ -1,8 +1,12 @@ #![deny(rust_2018_idioms)] #![forbid(unsafe_code)] +mod gpu_tiers; + +pub mod end_to_end_tests; +pub mod tests; + use engine_cpu::{CancelCheck, Candidate, EngineStatus, FoundOrigin, MinerEngine, Range}; -use futures::executor::block_on; use pow_core::{format_hashrate, format_u512, JobContext}; use primitive_types::U512; use std::cell::RefCell; @@ -150,7 +154,19 @@ impl GpuEngine { if batch_size == 0 { return Err("batch_size must be non-zero".into()); } - block_on(Self::init(batch_size, throttle_ms)) + + // Handle both cases: called from within a tokio runtime or from outside + match tokio::runtime::Handle::try_current() { + Ok(handle) => { + // We're inside a tokio runtime - use block_in_place to allow blocking + tokio::task::block_in_place(|| handle.block_on(Self::init(batch_size, throttle_ms))) + } + Err(_) => { + // No runtime exists - create a temporary one + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(Self::init(batch_size, throttle_ms)) + } + } } async fn init(batch_size: u32, throttle_ms: u64) -> Result> { @@ -172,20 +188,63 @@ impl GpuEngine { let mut contexts = Vec::new(); let mut adapter_infos = Vec::new(); + + // Timeout for initializing each adapter (30 seconds should be plenty) + let init_timeout = std::time::Duration::from_secs(30); + for (i, adapter) in adapters.into_iter().enumerate() { let info = adapter.get_info(); - log::debug!(target: "gpu_engine", "Adapter {} raw info: {:?}", i, info); + log::info!(target: "gpu_engine", "Initializing adapter {}: {} ...", i, info.name); + + // Skip software renderers - they're too slow for mining + if info.name.to_lowercase().contains("microsoft basic") + || info.name.to_lowercase().contains("software") + || info.name.to_lowercase().contains("llvmpipe") + || info.device_type == wgpu::DeviceType::Cpu + { + log::warn!( + target: "gpu_engine", + "Skipping software renderer adapter {}: {}", + i, info.name + ); + continue; + } + adapter_infos.push(info.clone()); - let (device, queue) = adapter - .request_device(&wgpu::DeviceDescriptor { - label: Some("Mining Device"), - required_features: wgpu::Features::empty(), - required_limits: wgpu::Limits::default(), - memory_hints: Default::default(), - ..Default::default() - }) - .await?; + // Try to initialize this adapter with a proper timeout. + // If the driver hangs, we'll skip this adapter after the timeout. + let device_future = adapter.request_device(&wgpu::DeviceDescriptor { + label: Some("Mining Device"), + required_features: wgpu::Features::empty(), + required_limits: wgpu::Limits::default(), + memory_hints: Default::default(), + ..Default::default() + }); + + let device_result = match tokio::time::timeout(init_timeout, device_future).await { + Ok(result) => result, + Err(_) => { + log::warn!( + target: "gpu_engine", + "Adapter {} ({}) timed out after {}s during initialization, skipping", + i, info.name, init_timeout.as_secs() + ); + continue; + } + }; + + let (device, queue) = match device_result { + Ok(dq) => dq, + Err(e) => { + log::warn!( + target: "gpu_engine", + "Failed to initialize adapter {} ({}): {}. Skipping.", + i, info.name, e + ); + continue; + } + }; // Log device limits at debug level let limits = device.limits(); @@ -198,6 +257,9 @@ impl GpuEngine { limits.max_buffer_size ); + // Shader and pipeline creation are synchronous - can't timeout, but usually fast + let pipeline_start = std::time::Instant::now(); + let shader_source = include_str!("mining.wgsl"); let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { label: Some("Mining Shader"), @@ -213,7 +275,12 @@ impl GpuEngine { cache: None, }); - log::debug!(target: "gpu_engine", "Pipeline initialized for adapter {}", i); + let pipeline_elapsed = pipeline_start.elapsed(); + log::info!( + target: "gpu_engine", + "Adapter {} ({}) initialized successfully (pipeline compiled in {:.1}s)", + i, info.name, pipeline_elapsed.as_secs_f64() + ); // Calculate vendor-specific configuration once during initialization let optimal_workgroups = get_vendor_specific_dispatch(&info, &device); @@ -226,6 +293,11 @@ impl GpuEngine { })); } + if contexts.is_empty() { + log::error!(target: "gpu_engine", "No GPU adapters could be initialized successfully."); + return Err("No GPU adapters could be initialized".into()); + } + log::info!( target: "gpu_engine", "GPU engine initialized with {} devices (batch size: {} nonces, throttle: {}ms)", @@ -430,6 +502,19 @@ impl MinerEngine for GpuEngine { BatchResult::NotFound { hash_count } => { total_hashes += hash_count; } + BatchResult::DeviceLost => { + // GPU device is lost/unresponsive - log loudly and return cancelled + // This prevents spinning at 0 H/s indefinitely on a dead device + log::error!( + target: "gpu_engine", + "GPU {} device lost or unresponsive - stopping worker. \ + This GPU will not process further batches.", + device_index + ); + return EngineStatus::Cancelled { + hash_count: total_hashes, + }; + } } // Move to next batch @@ -498,6 +583,8 @@ enum BatchResult { NotFound { hash_count: u64, }, + /// GPU device is lost or unresponsive - caller should stop using this device + DeviceLost, } /// Run a single batch of GPU computation @@ -568,25 +655,49 @@ fn run_single_batch( // Wait for GPU to complete (blocking) let buffer_slice = resources.staging_buffer.slice(..); - let mapped = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); - let mapped_clone = mapped.clone(); + // Use atomic to track completion: 0 = pending, 1 = success, 2 = error + let map_status = std::sync::Arc::new(std::sync::atomic::AtomicU8::new(0)); + let map_status_clone = map_status.clone(); buffer_slice.map_async(wgpu::MapMode::Read, move |result| { - if result.is_ok() { - mapped_clone.store(true, Ordering::Release); - } + map_status_clone.store(if result.is_ok() { 1 } else { 2 }, Ordering::Release); }); - // Poll until complete - loop { + // Poll until complete, error, or timeout (30 seconds max to prevent infinite hang) + let poll_start = std::time::Instant::now(); + let max_poll_duration = std::time::Duration::from_secs(30); + let final_status = loop { let _ = gpu_ctx.device.poll(wgpu::PollType::Wait { submission_index: None, timeout: Some(std::time::Duration::from_millis(10)), }); - if mapped.load(Ordering::Acquire) { - break; + match map_status.load(Ordering::Acquire) { + 1 => break 1, // Success - buffer is mapped + 2 => { + // Mapping failed - buffer was never successfully mapped, don't unmap + log::error!( + target: "gpu_engine", + "GPU buffer mapping failed - possible device lost or resource error" + ); + return BatchResult::DeviceLost; + } + _ => { + // Still pending - check timeout + if poll_start.elapsed() > max_poll_duration { + log::error!( + target: "gpu_engine", + "GPU buffer mapping timed out after {}s - GPU may be unresponsive", + max_poll_duration.as_secs() + ); + // Timeout: map_async callback never fired, buffer not mapped, don't unmap + return BatchResult::DeviceLost; + } + } } - } + }; + + // Only reach here if final_status == 1 (success), buffer is mapped + debug_assert_eq!(final_status, 1); // Read results let data = buffer_slice.get_mapped_range(); @@ -634,269 +745,22 @@ fn get_vendor_specific_dispatch(adapter_info: &wgpu::AdapterInfo, device: &wgpu: let limits = device.limits(); let max_workgroups = limits.max_compute_workgroups_per_dimension.min(65535); - // Parse vendor from adapter info - let vendor_name = adapter_info.name.to_lowercase(); - let _device_name = adapter_info.device.to_string().to_lowercase(); - - // Vendor-specific heuristics based on architecture knowledge - // Returns (workgroups, tier_name, is_fallback) - let (optimal_workgroups, tier, is_fallback) = - if vendor_name.contains("nvidia") || adapter_info.vendor == 4318 { - // NVIDIA GPUs (vendor ID 0x10DE = 4318) - if vendor_name.contains("5090") || vendor_name.contains("5080") { - ( - (max_workgroups / 6).max(5120), - "NVIDIA RTX 50 Flagship (Blackwell)", - false, - ) - } else if vendor_name.contains("5070") - || vendor_name.contains("5060") - || vendor_name.contains("rtx 50") - { - ( - (max_workgroups / 7).max(4608), - "NVIDIA RTX 50 (Blackwell)", - false, - ) - } else if vendor_name.contains("4090") || vendor_name.contains("4080") { - ( - (max_workgroups / 8).max(4096), - "NVIDIA RTX 40 Flagship (Ada)", - false, - ) - } else if vendor_name.contains("rtx 40") - || vendor_name.contains("4070") - || vendor_name.contains("4060") - { - ( - (max_workgroups / 10).max(3072), - "NVIDIA RTX 40 (Ada)", - false, - ) - } else if vendor_name.contains("rtx 30") - || vendor_name.contains("rtx 20") - || vendor_name.contains("3090") - || vendor_name.contains("3080") - || vendor_name.contains("3070") - || vendor_name.contains("2080") - || vendor_name.contains("2070") - || vendor_name.contains("2060") - { - ( - (max_workgroups / 12).max(2048), - "NVIDIA RTX 30/20 (Ampere/Turing)", - false, - ) - } else if vendor_name.contains("gtx 16") - || vendor_name.contains("gtx 10") - || vendor_name.contains("1660") - || vendor_name.contains("1650") - || vendor_name.contains("1080") - || vendor_name.contains("1070") - || vendor_name.contains("1060") - { - ( - (max_workgroups / 16).max(1024), - "NVIDIA GTX 16/10 (Turing/Pascal)", - false, - ) - } else if vendor_name.contains("gtx") { - ((max_workgroups / 18).max(768), "NVIDIA GTX (Legacy)", false) - } else if vendor_name.contains("quadro") - || vendor_name.contains("rtx a") - || vendor_name.contains("tesla") - { - ( - (max_workgroups / 10).max(2560), - "NVIDIA Quadro/Professional", - false, - ) - } else { - ((max_workgroups / 20).max(512), "NVIDIA Unknown", true) - } - } else if vendor_name.contains("amd") - || vendor_name.contains("radeon") - || adapter_info.vendor == 4098 - { - // AMD GPUs (vendor ID 0x1002 = 4098) - if vendor_name.contains("rx 9") - || vendor_name.contains("9070") - || vendor_name.contains("9080") - { - ( - (max_workgroups / 8).max(4096), - "AMD RX 9000 (RDNA 4)", - false, - ) - } else if vendor_name.contains("7900") { - ( - (max_workgroups / 9).max(3584), - "AMD RX 7900 (RDNA 3 Flagship)", - false, - ) - } else if vendor_name.contains("rx 7") - || vendor_name.contains("7800") - || vendor_name.contains("7700") - || vendor_name.contains("7600") - { - ( - (max_workgroups / 10).max(3072), - "AMD RX 7000 (RDNA 3)", - false, - ) - } else if vendor_name.contains("6900") || vendor_name.contains("6800") { - ( - (max_workgroups / 12).max(2560), - "AMD RX 6900/6800 (RDNA 2 Flagship)", - false, - ) - } else if vendor_name.contains("rx 6") - || vendor_name.contains("6700") - || vendor_name.contains("6600") - { - ( - (max_workgroups / 14).max(2048), - "AMD RX 6000 (RDNA 2)", - false, - ) - } else if vendor_name.contains("5700") { - ( - (max_workgroups / 16).max(1536), - "AMD RX 5700 (RDNA 1)", - false, - ) - } else if vendor_name.contains("rx 5") - || vendor_name.contains("5600") - || vendor_name.contains("5500") - { - ( - (max_workgroups / 18).max(1024), - "AMD RX 5000 (RDNA 1)", - false, - ) - } else if vendor_name.contains("rx 4") - || vendor_name.contains("580") - || vendor_name.contains("570") - { - ( - (max_workgroups / 20).max(768), - "AMD RX 500/400 (Polaris)", - false, - ) - } else if vendor_name.contains("radeon pro") - || vendor_name.contains("instinct") - || vendor_name.contains("mi") - { - ( - (max_workgroups / 10).max(2560), - "AMD Radeon Pro/Instinct", - false, - ) - } else { - ((max_workgroups / 24).max(512), "AMD Unknown", true) - } - } else if vendor_name.contains("intel") || adapter_info.vendor == 32902 { - // Intel GPUs (vendor ID 0x8086 = 32902) - if vendor_name.contains("arc b") - || vendor_name.contains("b580") - || vendor_name.contains("b570") - { - ( - (max_workgroups / 10).max(2560), - "Intel Arc B-Series (Battlemage)", - false, - ) - } else if vendor_name.contains("a770") || vendor_name.contains("a750") { - ( - (max_workgroups / 12).max(2048), - "Intel Arc A7 (Alchemist)", - false, - ) - } else if vendor_name.contains("a580") - || vendor_name.contains("a380") - || vendor_name.contains("arc a5") - || vendor_name.contains("arc a3") - { - ( - (max_workgroups / 16).max(1024), - "Intel Arc A5/A3 (Alchemist)", - false, - ) - } else if vendor_name.contains("a310") { - ((max_workgroups / 20).max(512), "Intel Arc A3 Entry", false) - } else if vendor_name.contains("iris xe") || vendor_name.contains("iris plus") { - ( - (max_workgroups / 24).max(384), - "Intel Iris Xe/Plus (Integrated)", - false, - ) - } else if vendor_name.contains("uhd") || vendor_name.contains("hd graphics") { - ( - (max_workgroups / 28).max(256), - "Intel UHD/HD Graphics (Integrated)", - false, - ) - } else { - ((max_workgroups / 24).max(256), "Intel Unknown", true) - } - } else if adapter_info.backend == wgpu::Backend::Metal { - // Apple GPUs (detected by Metal backend) - let (gpu_cores, workgroups, tier) = if vendor_name.contains("m4 ultra") { - (80, 1600, "Apple M4 Ultra") - } else if vendor_name.contains("m4 max") { - (40, 800, "Apple M4 Max") - } else if vendor_name.contains("m4 pro") { - (20, 400, "Apple M4 Pro") - } else if vendor_name.contains("m4") { - (10, 200, "Apple M4") - } else if vendor_name.contains("m3 ultra") { - (76, 1520, "Apple M3 Ultra") - } else if vendor_name.contains("m3 max") { - (40, 800, "Apple M3 Max") - } else if vendor_name.contains("m3 pro") { - (18, 360, "Apple M3 Pro") - } else if vendor_name.contains("m3") { - (10, 200, "Apple M3") - } else if vendor_name.contains("m2 ultra") { - (76, 1520, "Apple M2 Ultra") - } else if vendor_name.contains("m2 max") { - (38, 760, "Apple M2 Max") - } else if vendor_name.contains("m2 pro") { - (19, 380, "Apple M2 Pro") - } else if vendor_name.contains("m2") { - (10, 200, "Apple M2") - } else if vendor_name.contains("m1 ultra") { - (64, 1280, "Apple M1 Ultra") - } else if vendor_name.contains("m1 max") { - (32, 640, "Apple M1 Max") - } else if vendor_name.contains("m1 pro") { - (16, 320, "Apple M1 Pro") - } else if vendor_name.contains("m1") { - (8, 160, "Apple M1") - } else { - (8, 160, "Apple Silicon Unknown") - }; + let is_metal = adapter_info.backend == wgpu::Backend::Metal; + let tier = gpu_tiers::detect_gpu_tier(&adapter_info.name, adapter_info.vendor, is_metal); - let clamped_workgroups = workgroups.min(max_workgroups / 4).max(64); - let _ = gpu_cores; // gpu_cores currently unused but kept for potential future tuning - let is_fallback = tier == "Apple Silicon Unknown"; - (clamped_workgroups, tier, is_fallback) - } else { - // Unknown/Generic GPU - use conservative defaults - ((max_workgroups / 16).max(512), "Unknown GPU", true) - }; + let optimal_workgroups = (max_workgroups / tier.workgroup_divisor).max(tier.min_workgroups); // Log GPU detection result log::info!( target: "gpu_engine", "GPU detected: {} | tier: {} | workgroups: {} (max: {})", adapter_info.name, - tier, + tier.name, optimal_workgroups, max_workgroups ); - if is_fallback { + if tier.is_fallback { log::warn!( target: "gpu_engine", "GPU not recognized, using fallback config. Please report: name='{}', vendor=0x{:04X}, device={}", diff --git a/crates/engine-gpu/src/main.rs b/crates/engine-gpu/src/main.rs index 3262af9..19a1f18 100644 --- a/crates/engine-gpu/src/main.rs +++ b/crates/engine-gpu/src/main.rs @@ -1,10 +1,9 @@ -use futures::executor::block_on; - mod end_to_end_tests; mod tests; -fn main() { - block_on(run()).unwrap(); +#[tokio::main] +async fn main() -> Result<(), Box> { + run().await } async fn run() -> Result<(), Box> { diff --git a/crates/miner-cli/src/main.rs b/crates/miner-cli/src/main.rs index b2bbc55..1b4d627 100644 --- a/crates/miner-cli/src/main.rs +++ b/crates/miner-cli/src/main.rs @@ -166,10 +166,11 @@ async fn main() { fn init_logger(verbose: bool) { if std::env::var("RUST_LOG").is_err() { + // Filter out noisy wgpu/naga shader compilation logs let log_level = if verbose { - "debug,miner=debug,gpu_engine=debug,engine_cpu=debug" + "debug,miner=debug,gpu_engine=debug,engine_cpu=debug,wgpu=warn,wgpu_core=warn,wgpu_hal=warn,naga=warn" } else { - "info,miner=info,gpu_engine=info" + "info,miner=info,gpu_engine=info,wgpu=error,wgpu_core=error,wgpu_hal=error,naga=error" }; std::env::set_var("RUST_LOG", log_level); }