GPU Fallback Developer Guide
Last Updated: 2025-11-24 Audience: HeliosDB Developers Topic: GPU Operation Fallback Patterns
Quick Reference
Check GPU Capabilities
use heliosdb_gpu::capability;
// Check if GPU is available at all
if capability::is_gpu_available() {
    println!("GPU detected");
}
// Check specific operation support
if capability::supports_window_functions() {
    println!("Window functions can use GPU");
}
if capability::supports_advanced_aggregations() { println!("Advanced aggregations can use GPU");}
if capability::supports_hash_joins() { println!("Hash joins can use GPU");}
// Get full capability report
println!("{}", capability::capability_report());

Implement GPU Operation with Fallback
Pattern 1: Full CPU Fallback (Recommended)
Use when CPU implementation is straightforward:
use crate::capability;
use crate::cpu_fallback;
pub fn my_gpu_operation( &self, data: &[f64], config: &Config,) -> Result<Vec<f64>> { // Step 1: Check GPU capability if !capability::supports_my_operation() { // Step 2: Fallback to CPU implementation return cpu_fallback::my_operation_cpu(data, config); }
// Step 3: GPU implementation (if available) // ... allocate GPU memory ... // ... launch CUDA kernel ... // ... return results ...
    todo!("GPU kernel implementation pending")
}

Pattern 2: Clear Error Message
Use when CPU implementation is complex and not yet available:
use crate::capability;
pub fn complex_gpu_operation( &self, data: &ComplexData,) -> Result<ComplexResult> { // Step 1: Check GPU capability if !capability::supports_complex_operation() { // Step 2: Return clear error return Err(Error::OperationNotSupported( "Complex operation requires GPU (CPU fallback not yet implemented)".into() )); }
    // Step 3: GPU implementation
    todo!("GPU kernel implementation pending")
}

Adding New GPU Operations
Step 1: Add Capability Flag
Edit /home/claude/HeliosDB/heliosdb-gpu/src/capability.rs:
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]pub enum GpuCapability { // ... existing capabilities ...
    /// Your new operation
    MyNewOperation,
}

Add detection:
impl CapabilityCache { fn detect(&mut self) { // ... existing detection ...
self.capabilities.insert( GpuCapability::MyNewOperation, self.detect_my_operation_kernels(), ); }
    fn detect_my_operation_kernels(&self) -> bool {
        // Return true if kernels are compiled and available
        // For now, return false until GPU kernels are implemented
        false
    }
}

Add public function:
/// Check if my operation is supported on GPU
pub fn supports_my_operation() -> bool {
    has_capability(GpuCapability::DeviceAvailable)
        && has_capability(GpuCapability::MyNewOperation)
}

Update capability report:
pub fn capability_report() -> String { // ... existing code ...
let cap_names = [ // ... existing capabilities ... (GpuCapability::MyNewOperation, "My New Operation"), ];
    // ... rest of function ...
}

Step 2: Implement CPU Fallback
Edit /home/claude/HeliosDB/heliosdb-gpu/src/cpu_fallback.rs:
/// CPU implementation of my operation
pub fn my_operation_cpu(
    data: &[f64],
    config: &Config,
) -> Result<Vec<f64>> {
    let mut result = Vec::with_capacity(data.len());
// Implement algorithm for &value in data.iter() { let processed = /* your CPU algorithm */; result.push(processed); }
Ok(result)}
#[cfg(test)]
mod tests {
    use super::*;
#[test] fn test_my_operation_cpu() { let data = vec![1.0, 2.0, 3.0]; let config = Config::default(); let result = my_operation_cpu(&data, &config).unwrap();
        // Assert correctness
        assert_eq!(result.len(), data.len());
        // ... more assertions ...
    }
}

Step 3: Implement GPU Operation with Fallback
Edit your GPU module (e.g., /home/claude/HeliosDB/heliosdb-gpu/src/my_operations.rs):
use heliosdb_common::{Result, HeliosError as Error};
use std::sync::Arc;
use crate::device::GpuDevice;
use crate::memory_v2::{GpuBuffer, AdvancedGpuAllocator};
use crate::capability;
use crate::cpu_fallback;
pub struct GpuMyOperations { device: Arc<GpuDevice>, allocator: Arc<AdvancedGpuAllocator>,}
impl GpuMyOperations { pub fn new( device: Arc<GpuDevice>, allocator: Arc<AdvancedGpuAllocator>, ) -> Result<Self> { Ok(GpuMyOperations { device, allocator }) }
pub fn execute( &self, data: &[f64], config: &Config, ) -> Result<Vec<f64>> { let start = std::time::Instant::now();
// Check GPU capability if !capability::supports_my_operation() { // Fallback to CPU return cpu_fallback::my_operation_cpu(data, config); }
// Allocate GPU memory let data_buf = self.allocator.allocate( data.len() * 8, crate::memory_v2::DeviceId(0) )?;
// Copy data to GPU data_buf.copy_from_host(data)?;
// TODO: Launch CUDA kernel // - Kernel: my_operations.cu::my_kernel // - Grid/block configuration // - Launch and synchronize
        todo!("GPU kernel implementation pending")
    }
}

Step 4: Export from lib.rs
Edit /home/claude/HeliosDB/heliosdb-gpu/src/lib.rs:
pub mod my_operations;
pub use my_operations::{ GpuMyOperations, // ... other exports ...};
// Update capability exports if needed
pub use capability::{
    // ... existing exports ...
    supports_my_operation,
};

Step 5: Write Tests
Create /home/claude/HeliosDB/heliosdb-gpu/tests/my_operations_tests.rs:
use heliosdb_gpu::{GpuMyOperations, capability};
#[test]fn test_my_operation_cpu_fallback() { // This test should pass even without GPU let data = vec![1.0, 2.0, 3.0, 4.0, 5.0]; let config = Config::default();
// Even if GPU not available, should get correct CPU results let result = execute_my_operation(&data, &config).unwrap();
assert_eq!(result.len(), data.len()); // Assert correctness...}
#[test]fn test_my_operation_with_gpu() { if !capability::supports_my_operation() { println!("Skipping GPU test - GPU not available"); return; }
// GPU-specific test let data = vec![/* large dataset */]; let result = execute_my_operation(&data, &config).unwrap();
    // Assert GPU-specific behavior (performance, etc.)
}

Common Patterns
Pattern: Partition-Based Operations
pub fn partition_operation_cpu( partition_keys: &[i32], data_values: &[f64],) -> Result<Vec<f64>> { let mut result = vec![0.0; data_values.len()]; let mut current_partition = None; let mut partition_state = State::default();
for (idx, (&key, &value)) in partition_keys.iter().zip(data_values.iter()).enumerate() { if current_partition != Some(key) { // New partition - reset state current_partition = Some(key); partition_state = State::default(); }
// Process within partition partition_state.update(value); result[idx] = partition_state.compute(); }
    Ok(result)
}

Pattern: Group-Based Aggregations
pub fn group_aggregation_cpu( group_keys: &[i32], data_values: &[f64],) -> Result<Vec<(i32, f64)>> { let mut group_data: HashMap<i32, AggregateState> = HashMap::new();
// Accumulate for (&key, &value) in group_keys.iter().zip(data_values.iter()) { let state = group_data.entry(key).or_default(); state.accumulate(value); }
// Finalize let mut result: Vec<(i32, f64)> = group_data .into_iter() .map(|(key, state)| (key, state.finalize())) .collect();
    result.sort_by_key(|(k, _)| *k);
    Ok(result)
}

Pattern: Numerically Stable Algorithms
// Welford's algorithm for variance/stddev
pub fn welford_cpu(
    group_keys: &[i32],
    data_values: &[f64],
) -> Result<Vec<(i32, f64)>> {
    let mut group_stats: HashMap<i32, (usize, f64, f64)> = HashMap::new();
for (&key, &value) in group_keys.iter().zip(data_values.iter()) { let entry = group_stats.entry(key).or_insert((0, 0.0, 0.0)); let (count, mean, m2) = *entry;
let new_count = count + 1; let delta = value - mean; let new_mean = mean + delta / new_count as f64; let delta2 = value - new_mean; let new_m2 = m2 + delta * delta2;
*entry = (new_count, new_mean, new_m2); }
    // Compute final result from (count, mean, m2)
    // ...
}

Testing Strategy
Unit Tests (CPU Fallback)
Always write unit tests for CPU fallback implementations:
#[cfg(test)]
mod tests {
    use super::*;
#[test] fn test_empty_input() { let result = my_operation_cpu(&[], &Config::default()).unwrap(); assert_eq!(result.len(), 0); }
#[test] fn test_single_value() { let result = my_operation_cpu(&[42.0], &Config::default()).unwrap(); assert_eq!(result.len(), 1); }
#[test] fn test_correctness() { let input = vec![1.0, 2.0, 3.0]; let result = my_operation_cpu(&input, &Config::default()).unwrap();
// Assert specific values assert_eq!(result, vec![/* expected */]); }
#[test] fn test_edge_cases() { // NaN, infinity, very large/small values let input = vec![f64::NAN, f64::INFINITY, 0.0, -0.0]; let result = my_operation_cpu(&input, &Config::default());
        // Assert proper handling
    }
}

Integration Tests (GPU + Fallback)
#[test]fn test_cpu_gpu_equivalence() { if !capability::is_gpu_available() { println!("Skipping equivalence test - GPU not available"); return; }
let input = generate_test_data();
// Execute on CPU let cpu_result = cpu_fallback::my_operation_cpu(&input, &config).unwrap();
// Execute on GPU let gpu_result = gpu_executor.execute(&input, &config).unwrap();
    // Results should be identical (or within numerical tolerance)
    assert_eq!(cpu_result.len(), gpu_result.len());
    for (cpu_val, gpu_val) in cpu_result.iter().zip(gpu_result.iter()) {
        assert!((cpu_val - gpu_val).abs() < 1e-10);
    }
}

Debugging Tips
Check Capability Detection
// Print full capability report
println!("{}", heliosdb_gpu::capability::capability_report());

// Check specific capability
if !heliosdb_gpu::supports_window_functions() {
    println!("Window functions will use CPU fallback");
}

// Force re-detection (useful during development)
heliosdb_gpu::refresh_capabilities();

Add Logging
use tracing::{info, warn};
pub fn my_operation(&self, data: &[f64]) -> Result<Vec<f64>> { if !capability::supports_my_operation() { warn!("GPU not available, using CPU fallback for my_operation"); return cpu_fallback::my_operation_cpu(data); }
    info!("Using GPU acceleration for my_operation");
    // GPU implementation...
}

Benchmarking
use std::time::Instant;
// Compare CPU vs GPU performance
let start = Instant::now();
let cpu_result = cpu_fallback::my_operation_cpu(&data, &config)?;
let cpu_time = start.elapsed();

let start = Instant::now();
let gpu_result = gpu_executor.execute(&data, &config)?;
let gpu_time = start.elapsed();

println!(
    "CPU: {:?}, GPU: {:?}, Speedup: {:.2}x",
    cpu_time,
    gpu_time,
    cpu_time.as_secs_f64() / gpu_time.as_secs_f64()
);

Best Practices
1. Always Implement CPU Fallback First
// Good: CPU fallback ensures correctness
pub fn new_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
    if !capability::supports_new_operation() {
        return cpu_fallback::new_operation_cpu(data);
    }
    // GPU implementation (can be added later)
}

// Bad: No fallback, operation will panic
pub fn new_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
    // GPU implementation only
    todo!("Not implemented")
}

2. Use Descriptive Error Messages
// Good: Clear explanation
if !capability::supports_correlation() {
    return Err(Error::OperationNotSupported(
        "Correlation requires GPU acceleration (CPU fallback not yet implemented). \
         Please enable GPU support or use simpler statistical functions.".into()
    ));
}

// Bad: Generic error
if !capability::supports_correlation() {
    return Err(Error::OperationNotSupported("Not supported".into()));
}

3. Test Both Paths
#[test]
fn test_with_cpu_fallback() {
    // Should pass even without GPU
    let result = execute_operation(&data).unwrap();
    assert_correct(result);
}

#[test]
fn test_with_gpu() {
    if !capability::is_gpu_available() {
        return; // Skip if GPU not available
    }
    let result = execute_operation(&data).unwrap();
    assert_correct(result);
}

4. Document Fallback Behavior
/// Execute my GPU-accelerated operation
///
/// # Performance
/// - With GPU: 10-50x speedup over CPU
/// - Without GPU: Falls back to CPU implementation (correct but slower)
///
/// # Arguments
/// * `data` - Input data
///
/// # Returns
/// Processed results (always correct, regardless of GPU availability)
pub fn my_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
    // Implementation...
}

Common Pitfalls
Pitfall 1: Forgetting Capability Check
// Wrong: No capability check
pub fn operation(&self, data: &[f64]) -> Result<Vec<f64>> {
    // Assumes GPU always available
    launch_gpu_kernel(data)?;
    Ok(results)
}

// Correct: Always check capability
pub fn operation(&self, data: &[f64]) -> Result<Vec<f64>> {
    if !capability::supports_operation() {
        return cpu_fallback::operation_cpu(data);
    }
    launch_gpu_kernel(data)?;
    Ok(results)
}

Pitfall 2: Inconsistent Results
// Wrong: CPU and GPU return different results
fn cpu_version() -> f64 { 1.0 / 3.0 } // 0.333...
fn gpu_version() -> f64 { 0.333 }     // Truncated

// Correct: Both return numerically equivalent results
fn cpu_version() -> f64 { 1.0 / 3.0 }
fn gpu_version() -> f64 { 1.0 / 3.0 } // Same calculation

Pitfall 3: Ignoring Edge Cases
// Wrong: Doesn't handle empty input
pub fn operation_cpu(data: &[f64]) -> Result<Vec<f64>> {
    let mut result = Vec::new();
    let first = data[0]; // Panic if data is empty!
    // ...
}

// Correct: Handle edge cases
pub fn operation_cpu(data: &[f64]) -> Result<Vec<f64>> {
    if data.is_empty() {
        return Ok(Vec::new());
    }
    let first = data[0];
    // ...
}

Example: Complete Implementation
Here’s a complete example implementing a new GPU operation with fallback:
use heliosdb_common::{Result, HeliosError as Error};
use std::sync::Arc;
use crate::device::GpuDevice;
use crate::memory_v2::{GpuBuffer, AdvancedGpuAllocator};
use crate::capability;
use crate::cpu_fallback;
/// My new GPU-accelerated featurepub struct GpuMyNewFeature { device: Arc<GpuDevice>, allocator: Arc<AdvancedGpuAllocator>,}
impl GpuMyNewFeature { pub fn new( device: Arc<GpuDevice>, allocator: Arc<AdvancedGpuAllocator>, ) -> Result<Self> { Ok(GpuMyNewFeature { device, allocator }) }
/// Execute my feature /// /// # Performance /// - GPU: 20x faster than CPU /// - CPU fallback: Always correct /// /// # Arguments /// * `input` - Input data /// /// # Returns /// Processed results pub fn execute(&self, input: &[f64]) -> Result<Vec<f64>> { // Check GPU capability if !capability::supports_my_new_feature() { tracing::info!("Using CPU fallback for my_new_feature"); return cpu_fallback::my_new_feature_cpu(input); }
tracing::info!("Using GPU acceleration for my_new_feature");
// Allocate GPU memory let input_buf = self.allocator.allocate( input.len() * 8, crate::memory_v2::DeviceId(0) )?;
// Copy to GPU input_buf.copy_from_host(input)?;
// TODO: Launch CUDA kernel // - Kernel: my_new_feature.cu::process_kernel // - Grid: (input.len() + 255) / 256 blocks // - Block: 256 threads
todo!("GPU kernel implementation pending") }}
// File: heliosdb-gpu/src/cpu_fallback.rs
/// CPU implementation of my_new_feature
pub fn my_new_feature_cpu(input: &[f64]) -> Result<Vec<f64>> {
    let mut result = Vec::with_capacity(input.len());
for &value in input.iter() { // CPU algorithm let processed = value * 2.0; // Example result.push(processed); }
Ok(result)}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_my_new_feature_cpu() {
        let input = vec![1.0, 2.0, 3.0];
        let result = my_new_feature_cpu(&input).unwrap();
        assert_eq!(result, vec![2.0, 4.0, 6.0]);
    }
}

Summary Checklist
When implementing a new GPU operation:
- Add capability flag to capability.rs
- Implement CPU fallback in cpu_fallback.rs
- Write unit tests for CPU fallback
- Implement GPU operation with fallback check
- Add operation to module exports in lib.rs
- Write integration tests (CPU and GPU paths)
- Document fallback behavior
- Add logging for fallback usage
- Verify edge cases handled
- Ensure CPU/GPU numerical equivalence
Guide Version: 1.0
Last Updated: 2025-11-24
Questions? See /home/claude/HeliosDB/docs/implementation/GPU_FALLBACK_IMPLEMENTATION_REPORT.md