Skip to content

GPU Fallback Developer Guide

GPU Fallback Developer Guide

Last Updated: 2025-11-24 Audience: HeliosDB Developers Topic: GPU Operation Fallback Patterns

Quick Reference

Check GPU Capabilities

use heliosdb_gpu::capability;
// Check if GPU is available at all
if capability::is_gpu_available() {
println!("GPU detected");
}
// Check specific operation support
if capability::supports_window_functions() {
println!("Window functions can use GPU");
}
if capability::supports_advanced_aggregations() {
println!("Advanced aggregations can use GPU");
}
if capability::supports_hash_joins() {
println!("Hash joins can use GPU");
}
// Get full capability report
println!("{}", capability::capability_report());

Pattern 1: Implement GPU Operation with CPU Fallback

Use when CPU implementation is straightforward:

use crate::capability;
use crate::cpu_fallback;
pub fn my_gpu_operation(
&self,
data: &[f64],
config: &Config,
) -> Result<Vec<f64>> {
// Step 1: Check GPU capability
if !capability::supports_my_operation() {
// Step 2: Fallback to CPU implementation
return cpu_fallback::my_operation_cpu(data, config);
}
// Step 3: GPU implementation (if available)
// ... allocate GPU memory ...
// ... launch CUDA kernel ...
// ... return results ...
todo!("GPU kernel implementation pending")
}

Pattern 2: Clear Error Message

Use when CPU implementation is complex and not yet available:

use crate::capability;
pub fn complex_gpu_operation(
&self,
data: &ComplexData,
) -> Result<ComplexResult> {
// Step 1: Check GPU capability
if !capability::supports_complex_operation() {
// Step 2: Return clear error
return Err(Error::OperationNotSupported(
"Complex operation requires GPU (CPU fallback not yet implemented)".into()
));
}
// Step 3: GPU implementation
todo!("GPU kernel implementation pending")
}

Adding New GPU Operations

Step 1: Add Capability Flag

Edit /home/claude/HeliosDB/heliosdb-gpu/src/capability.rs:

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuCapability {
// ... existing capabilities ...
/// Your new operation
MyNewOperation,
}

Add detection:

impl CapabilityCache {
fn detect(&mut self) {
// ... existing detection ...
self.capabilities.insert(
GpuCapability::MyNewOperation,
self.detect_my_operation_kernels(),
);
}
fn detect_my_operation_kernels(&self) -> bool {
// Return true if kernels are compiled and available
// For now, return false until GPU kernels are implemented
false
}
}

Add public function:

/// Check if my operation is supported on GPU
pub fn supports_my_operation() -> bool {
has_capability(GpuCapability::DeviceAvailable)
&& has_capability(GpuCapability::MyNewOperation)
}

Update capability report:

pub fn capability_report() -> String {
// ... existing code ...
let cap_names = [
// ... existing capabilities ...
(GpuCapability::MyNewOperation, "My New Operation"),
];
// ... rest of function ...
}

Step 2: Implement CPU Fallback

Edit /home/claude/HeliosDB/heliosdb-gpu/src/cpu_fallback.rs:

/// CPU implementation of my operation
pub fn my_operation_cpu(
data: &[f64],
config: &Config,
) -> Result<Vec<f64>> {
let mut result = Vec::with_capacity(data.len());
// Implement algorithm
for &value in data.iter() {
let processed = value; // TODO: replace with your CPU algorithm
result.push(processed);
}
Ok(result)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_my_operation_cpu() {
let data = vec![1.0, 2.0, 3.0];
let config = Config::default();
let result = my_operation_cpu(&data, &config).unwrap();
// Assert correctness
assert_eq!(result.len(), data.len());
// ... more assertions ...
}
}

Step 3: Implement GPU Operation with Fallback

Edit your GPU module (e.g., /home/claude/HeliosDB/heliosdb-gpu/src/my_operations.rs):

use heliosdb_common::{Result, HeliosError as Error};
use std::sync::Arc;
use crate::device::GpuDevice;
use crate::memory_v2::{GpuBuffer, AdvancedGpuAllocator};
use crate::capability;
use crate::cpu_fallback;
pub struct GpuMyOperations {
device: Arc<GpuDevice>,
allocator: Arc<AdvancedGpuAllocator>,
}
impl GpuMyOperations {
pub fn new(
device: Arc<GpuDevice>,
allocator: Arc<AdvancedGpuAllocator>,
) -> Result<Self> {
Ok(GpuMyOperations { device, allocator })
}
pub fn execute(
&self,
data: &[f64],
config: &Config,
) -> Result<Vec<f64>> {
let start = std::time::Instant::now();
// Check GPU capability
if !capability::supports_my_operation() {
// Fallback to CPU
return cpu_fallback::my_operation_cpu(data, config);
}
// Allocate GPU memory
let data_buf = self.allocator.allocate(
data.len() * std::mem::size_of::<f64>(),
crate::memory_v2::DeviceId(0)
)?;
// Copy data to GPU
data_buf.copy_from_host(data)?;
// TODO: Launch CUDA kernel
// - Kernel: my_operations.cu::my_kernel
// - Grid/block configuration
// - Launch and synchronize
todo!("GPU kernel implementation pending")
}
}

Step 4: Export from lib.rs

Edit /home/claude/HeliosDB/heliosdb-gpu/src/lib.rs:

pub mod my_operations;
pub use my_operations::{
GpuMyOperations,
// ... other exports ...
};
// Update capability exports if needed
pub use capability::{
// ... existing exports ...
supports_my_operation,
};

Step 5: Write Tests

Create /home/claude/HeliosDB/heliosdb-gpu/tests/my_operations_tests.rs:

use heliosdb_gpu::{GpuMyOperations, capability};
#[test]
fn test_my_operation_cpu_fallback() {
// This test should pass even without GPU
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let config = Config::default();
// Even if GPU not available, should get correct CPU results
let result = execute_my_operation(&data, &config).unwrap();
assert_eq!(result.len(), data.len());
// Assert correctness...
}
#[test]
fn test_my_operation_with_gpu() {
if !capability::supports_my_operation() {
println!("Skipping GPU test - GPU not available");
return;
}
// GPU-specific test
let data = vec![/* large dataset */];
let config = Config::default();
let result = execute_my_operation(&data, &config).unwrap();
// Assert GPU-specific behavior (performance, etc.)
}

Common Patterns

Pattern: Partition-Based Operations

pub fn partition_operation_cpu(
partition_keys: &[i32],
data_values: &[f64],
) -> Result<Vec<f64>> {
let mut result = vec![0.0; data_values.len()];
let mut current_partition = None;
let mut partition_state = State::default();
for (idx, (&key, &value)) in partition_keys.iter().zip(data_values.iter()).enumerate() {
if current_partition != Some(key) {
// New partition - reset state
current_partition = Some(key);
partition_state = State::default();
}
// Process within partition
partition_state.update(value);
result[idx] = partition_state.compute();
}
Ok(result)
}

Pattern: Group-Based Aggregations

pub fn group_aggregation_cpu(
group_keys: &[i32],
data_values: &[f64],
) -> Result<Vec<(i32, f64)>> {
let mut group_data: HashMap<i32, AggregateState> = HashMap::new();
// Accumulate
for (&key, &value) in group_keys.iter().zip(data_values.iter()) {
let state = group_data.entry(key).or_default();
state.accumulate(value);
}
// Finalize
let mut result: Vec<(i32, f64)> = group_data
.into_iter()
.map(|(key, state)| (key, state.finalize()))
.collect();
result.sort_by_key(|(k, _)| *k);
Ok(result)
}

Pattern: Numerically Stable Algorithms

// Welford's algorithm for variance/stddev
pub fn welford_cpu(
group_keys: &[i32],
data_values: &[f64],
) -> Result<Vec<(i32, f64)>> {
let mut group_stats: HashMap<i32, (usize, f64, f64)> = HashMap::new();
for (&key, &value) in group_keys.iter().zip(data_values.iter()) {
let entry = group_stats.entry(key).or_insert((0, 0.0, 0.0));
let (count, mean, m2) = *entry;
let new_count = count + 1;
let delta = value - mean;
let new_mean = mean + delta / new_count as f64;
let delta2 = value - new_mean;
let new_m2 = m2 + delta * delta2;
*entry = (new_count, new_mean, new_m2);
}
// Compute final result from (count, mean, m2)
// ...
}

Testing Strategy

Unit Tests (CPU Fallback)

Always write unit tests for CPU fallback implementations:

#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty_input() {
let result = my_operation_cpu(&[], &Config::default()).unwrap();
assert_eq!(result.len(), 0);
}
#[test]
fn test_single_value() {
let result = my_operation_cpu(&[42.0], &Config::default()).unwrap();
assert_eq!(result.len(), 1);
}
#[test]
fn test_correctness() {
let input = vec![1.0, 2.0, 3.0];
let result = my_operation_cpu(&input, &Config::default()).unwrap();
// Assert specific values
assert_eq!(result, vec![/* expected */]);
}
#[test]
fn test_edge_cases() {
// NaN, infinity, very large/small values
let input = vec![f64::NAN, f64::INFINITY, 0.0, -0.0];
let result = my_operation_cpu(&input, &Config::default());
// Assert proper handling
}
}

Integration Tests (GPU + Fallback)

#[test]
fn test_cpu_gpu_equivalence() {
if !capability::is_gpu_available() {
println!("Skipping equivalence test - GPU not available");
return;
}
let input = generate_test_data();
// Execute on CPU
let cpu_result = cpu_fallback::my_operation_cpu(&input, &config).unwrap();
// Execute on GPU
let gpu_result = gpu_executor.execute(&input, &config).unwrap();
// Results should be identical (or within numerical tolerance)
assert_eq!(cpu_result.len(), gpu_result.len());
for (cpu_val, gpu_val) in cpu_result.iter().zip(gpu_result.iter()) {
assert!((cpu_val - gpu_val).abs() < 1e-10);
}
}

Debugging Tips

Check Capability Detection

// Print full capability report
println!("{}", heliosdb_gpu::capability::capability_report());
// Check specific capability
if !heliosdb_gpu::supports_window_functions() {
println!("Window functions will use CPU fallback");
}
// Force re-detection (useful during development)
heliosdb_gpu::refresh_capabilities();

Add Logging

use tracing::{info, warn};
pub fn my_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
if !capability::supports_my_operation() {
warn!("GPU not available, using CPU fallback for my_operation");
return cpu_fallback::my_operation_cpu(data);
}
info!("Using GPU acceleration for my_operation");
// GPU implementation...
}

Benchmarking

use std::time::Instant;
// Compare CPU vs GPU performance
let start = Instant::now();
let cpu_result = cpu_fallback::my_operation_cpu(&data, &config)?;
let cpu_time = start.elapsed();
let start = Instant::now();
let gpu_result = gpu_executor.execute(&data, &config)?;
let gpu_time = start.elapsed();
println!("CPU: {:?}, GPU: {:?}, Speedup: {:.2}x",
cpu_time, gpu_time, cpu_time.as_secs_f64() / gpu_time.as_secs_f64());

Best Practices

1. Always Implement CPU Fallback First

// Good: CPU fallback ensures correctness
pub fn new_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
if !capability::supports_new_operation() {
return cpu_fallback::new_operation_cpu(data);
}
// GPU implementation (can be added later)
}
// Bad: No fallback, operation will panic
pub fn new_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
// GPU implementation only
todo!("Not implemented")
}

2. Use Descriptive Error Messages

// Good: Clear explanation
if !capability::supports_correlation() {
return Err(Error::OperationNotSupported(
"Correlation requires GPU acceleration (CPU fallback not yet implemented). \
Please enable GPU support or use simpler statistical functions.".into()
));
}
// Bad: Generic error
if !capability::supports_correlation() {
return Err(Error::OperationNotSupported("Not supported".into()));
}

3. Test Both Paths

#[test]
fn test_with_cpu_fallback() {
// Should pass even without GPU
let result = execute_operation(&data).unwrap();
assert_correct(result);
}
#[test]
fn test_with_gpu() {
if !capability::is_gpu_available() {
return; // Skip if GPU not available
}
let result = execute_operation(&data).unwrap();
assert_correct(result);
}

4. Document Fallback Behavior

/// Execute my GPU-accelerated operation
///
/// # Performance
/// - With GPU: 10-50x speedup over CPU
/// - Without GPU: Falls back to CPU implementation (correct but slower)
///
/// # Arguments
/// * `data` - Input data
///
/// # Returns
/// Processed results (always correct, regardless of GPU availability)
pub fn my_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
// Implementation...
}

Common Pitfalls

Pitfall 1: Forgetting Capability Check

// Wrong: No capability check
pub fn operation(&self, data: &[f64]) -> Result<Vec<f64>> {
// Assumes GPU always available
launch_gpu_kernel(data)?;
Ok(results)
}
// Correct: Always check capability
pub fn operation(&self, data: &[f64]) -> Result<Vec<f64>> {
if !capability::supports_operation() {
return cpu_fallback::operation_cpu(data);
}
launch_gpu_kernel(data)?;
Ok(results)
}

Pitfall 2: Inconsistent Results

// Wrong: CPU and GPU return different results
fn cpu_version() -> f64 { 1.0 / 3.0 } // 0.333...
fn gpu_version() -> f64 { 0.333 } // Truncated
// Correct: Both return numerically equivalent results
fn cpu_version() -> f64 { 1.0 / 3.0 }
fn gpu_version() -> f64 { 1.0 / 3.0 } // Same calculation

Pitfall 3: Ignoring Edge Cases

// Wrong: Doesn't handle empty input
pub fn operation_cpu(data: &[f64]) -> Result<Vec<f64>> {
let mut result = Vec::new();
let first = data[0]; // Panic if data is empty!
// ...
}
// Correct: Handle edge cases
pub fn operation_cpu(data: &[f64]) -> Result<Vec<f64>> {
if data.is_empty() {
return Ok(Vec::new());
}
let first = data[0];
// ...
}

Example: Complete Implementation

Here’s a complete example implementing a new GPU operation with fallback:

heliosdb-gpu/src/my_new_feature.rs
use heliosdb_common::{Result, HeliosError as Error};
use std::sync::Arc;
use crate::device::GpuDevice;
use crate::memory_v2::{GpuBuffer, AdvancedGpuAllocator};
use crate::capability;
use crate::cpu_fallback;
/// My new GPU-accelerated feature
pub struct GpuMyNewFeature {
device: Arc<GpuDevice>,
allocator: Arc<AdvancedGpuAllocator>,
}
impl GpuMyNewFeature {
pub fn new(
device: Arc<GpuDevice>,
allocator: Arc<AdvancedGpuAllocator>,
) -> Result<Self> {
Ok(GpuMyNewFeature { device, allocator })
}
/// Execute my feature
///
/// # Performance
/// - GPU: 20x faster than CPU
/// - CPU fallback: Always correct
///
/// # Arguments
/// * `input` - Input data
///
/// # Returns
/// Processed results
pub fn execute(&self, input: &[f64]) -> Result<Vec<f64>> {
// Check GPU capability
if !capability::supports_my_new_feature() {
tracing::info!("Using CPU fallback for my_new_feature");
return cpu_fallback::my_new_feature_cpu(input);
}
tracing::info!("Using GPU acceleration for my_new_feature");
// Allocate GPU memory
let input_buf = self.allocator.allocate(
input.len() * std::mem::size_of::<f64>(),
crate::memory_v2::DeviceId(0)
)?;
// Copy to GPU
input_buf.copy_from_host(input)?;
// TODO: Launch CUDA kernel
// - Kernel: my_new_feature.cu::process_kernel
// - Grid: (input.len() + 255) / 256 blocks
// - Block: 256 threads
todo!("GPU kernel implementation pending")
}
}
// File: heliosdb-gpu/src/cpu_fallback.rs
/// CPU implementation of my_new_feature
pub fn my_new_feature_cpu(input: &[f64]) -> Result<Vec<f64>> {
let mut result = Vec::with_capacity(input.len());
for &value in input.iter() {
// CPU algorithm
let processed = value * 2.0; // Example
result.push(processed);
}
Ok(result)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_my_new_feature_cpu() {
let input = vec![1.0, 2.0, 3.0];
let result = my_new_feature_cpu(&input).unwrap();
assert_eq!(result, vec![2.0, 4.0, 6.0]);
}
}

Summary Checklist

When implementing a new GPU operation:

  • Add capability flag to capability.rs
  • Implement CPU fallback in cpu_fallback.rs
  • Write unit tests for CPU fallback
  • Implement GPU operation with fallback check
  • Add operation to module exports in lib.rs
  • Write integration tests (CPU and GPU paths)
  • Document fallback behavior
  • Add logging for fallback usage
  • Verify edge cases handled
  • Ensure CPU/GPU numerical equivalence

Guide Version: 1.0 Last Updated: 2025-11-24 Questions? See /home/claude/HeliosDB/docs/implementation/GPU_FALLBACK_IMPLEMENTATION_REPORT.md