Skip to content

GPU Fallback Developer Guide

GPU Fallback Developer Guide

Last Updated: 2025-11-24 Audience: HeliosDB Developers Topic: GPU Operation Fallback Patterns

Quick Reference

Check GPU Capabilities

use heliosdb_gpu::capability;
// Check if GPU is available at all
if capability::is_gpu_available() {
println!("GPU detected");
}
// Check specific operation support
if capability::supports_window_functions() {
println!("Window functions can use GPU");
}
if capability::supports_advanced_aggregations() {
println!("Advanced aggregations can use GPU");
}
if capability::supports_hash_joins() {
println!("Hash joins can use GPU");
}
// Get full capability report
println!("{}", capability::capability_report());

Pattern 1: Implement GPU Operation with CPU Fallback

Use when CPU implementation is straightforward:

use crate::capability;
use crate::cpu_fallback;
pub fn my_gpu_operation(
&self,
data: &[f64],
config: &Config,
) -> Result<Vec<f64>> {
// Step 1: Check GPU capability
if !capability::supports_my_operation() {
// Step 2: Fallback to CPU implementation
return cpu_fallback::my_operation_cpu(data, config);
}
// Step 3: GPU implementation (if available)
// ... allocate GPU memory ...
// ... launch CUDA kernel ...
// ... return results ...
todo!("GPU kernel implementation pending")
}

Pattern 2: Clear Error Message

Use when CPU implementation is complex and not yet available:

use crate::capability;
pub fn complex_gpu_operation(
&self,
data: &ComplexData,
) -> Result<ComplexResult> {
// Step 1: Check GPU capability
if !capability::supports_complex_operation() {
// Step 2: Return clear error
return Err(Error::OperationNotSupported(
"Complex operation requires GPU (CPU fallback not yet implemented)".into()
));
}
// Step 3: GPU implementation
todo!("GPU kernel implementation pending")
}

Adding New GPU Operations

Step 1: Add Capability Flag

Edit /home/claude/HeliosDB/heliosdb-gpu/src/capability.rs:

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuCapability {
// ... existing capabilities ...
/// Your new operation
MyNewOperation,
}

Add detection:

impl CapabilityCache {
fn detect(&mut self) {
// ... existing detection ...
self.capabilities.insert(
GpuCapability::MyNewOperation,
self.detect_my_operation_kernels(),
);
}
fn detect_my_operation_kernels(&self) -> bool {
// Return true if kernels are compiled and available
// For now, return false until GPU kernels are implemented
false
}
}

Add public function:

/// Check if my operation is supported on GPU
pub fn supports_my_operation() -> bool {
has_capability(GpuCapability::DeviceAvailable)
&& has_capability(GpuCapability::MyNewOperation)
}

Update capability report:

pub fn capability_report() -> String {
// ... existing code ...
let cap_names = [
// ... existing capabilities ...
(GpuCapability::MyNewOperation, "My New Operation"),
];
// ... rest of function ...
}

Step 2: Implement CPU Fallback

Edit /home/claude/HeliosDB/heliosdb-gpu/src/cpu_fallback.rs:

/// CPU implementation of my operation
pub fn my_operation_cpu(
data: &[f64],
config: &Config,
) -> Result<Vec<f64>> {
let mut result = Vec::with_capacity(data.len());
// Implement algorithm
for &value in data.iter() {
let processed = value; // TODO: replace with your CPU algorithm
result.push(processed);
}
Ok(result)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_my_operation_cpu() {
let data = vec![1.0, 2.0, 3.0];
let config = Config::default();
let result = my_operation_cpu(&data, &config).unwrap();
// Assert correctness
assert_eq!(result.len(), data.len());
// ... more assertions ...
}
}

Step 3: Implement GPU Operation with Fallback

Edit your GPU module (e.g., /home/claude/HeliosDB/heliosdb-gpu/src/my_operations.rs):

use heliosdb_common::{Result, HeliosError as Error};
use std::sync::Arc;
use crate::device::GpuDevice;
use crate::memory_v2::{GpuBuffer, AdvancedGpuAllocator};
use crate::capability;
use crate::cpu_fallback;
pub struct GpuMyOperations {
device: Arc<GpuDevice>,
allocator: Arc<AdvancedGpuAllocator>,
}
impl GpuMyOperations {
pub fn new(
device: Arc<GpuDevice>,
allocator: Arc<AdvancedGpuAllocator>,
) -> Result<Self> {
Ok(GpuMyOperations { device, allocator })
}
pub fn execute(
&self,
data: &[f64],
config: &Config,
) -> Result<Vec<f64>> {
let start = std::time::Instant::now();
// Check GPU capability
if !capability::supports_my_operation() {
// Fallback to CPU
return cpu_fallback::my_operation_cpu(data, config);
}
// Allocate GPU memory
let data_buf = self.allocator.allocate(
data.len() * std::mem::size_of::<f64>(),
crate::memory_v2::DeviceId(0)
)?;
// Copy data to GPU
data_buf.copy_from_host(data)?;
// TODO: Launch CUDA kernel
// - Kernel: my_operations.cu::my_kernel
// - Grid/block configuration
// - Launch and synchronize
todo!("GPU kernel implementation pending")
}
}

Step 4: Export from lib.rs

Edit /home/claude/HeliosDB/heliosdb-gpu/src/lib.rs:

pub mod my_operations;
pub use my_operations::{
GpuMyOperations,
// ... other exports ...
};
// Update capability exports if needed
pub use capability::{
// ... existing exports ...
supports_my_operation,
};

Step 5: Write Tests

Create /home/claude/HeliosDB/heliosdb-gpu/tests/my_operations_tests.rs:

use heliosdb_gpu::{GpuMyOperations, capability};
#[test]
fn test_my_operation_cpu_fallback() {
// This test should pass even without GPU
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let config = Config::default();
// Even if GPU not available, should get correct CPU results
let result = execute_my_operation(&data, &config).unwrap();
assert_eq!(result.len(), data.len());
// Assert correctness...
}
#[test]
fn test_my_operation_with_gpu() {
if !capability::supports_my_operation() {
println!("Skipping GPU test - GPU not available");
return;
}
// GPU-specific test
let data = vec![/* large dataset */];
let config = Config::default();
let result = execute_my_operation(&data, &config).unwrap();
// Assert GPU-specific behavior (performance, etc.)
}

Common Patterns

Pattern: Partition-Based Operations

pub fn partition_operation_cpu(
partition_keys: &[i32],
data_values: &[f64],
) -> Result<Vec<f64>> {
let mut result = vec![0.0; data_values.len()];
let mut current_partition = None;
let mut partition_state = State::default();
for (idx, (&key, &value)) in partition_keys.iter().zip(data_values.iter()).enumerate() {
if current_partition != Some(key) {
// New partition - reset state
current_partition = Some(key);
partition_state = State::default();
}
// Process within partition
partition_state.update(value);
result[idx] = partition_state.compute();
}
Ok(result)
}

Pattern: Group-Based Aggregations

pub fn group_aggregation_cpu(
group_keys: &[i32],
data_values: &[f64],
) -> Result<Vec<(i32, f64)>> {
let mut group_data: HashMap<i32, AggregateState> = HashMap::new();
// Accumulate
for (&key, &value) in group_keys.iter().zip(data_values.iter()) {
let state = group_data.entry(key).or_default();
state.accumulate(value);
}
// Finalize
let mut result: Vec<(i32, f64)> = group_data
.into_iter()
.map(|(key, state)| (key, state.finalize()))
.collect();
result.sort_by_key(|(k, _)| *k);
Ok(result)
}

Pattern: Numerically Stable Algorithms

// Welford's algorithm for variance/stddev
pub fn welford_cpu(
group_keys: &[i32],
data_values: &[f64],
) -> Result<Vec<(i32, f64)>> {
let mut group_stats: HashMap<i32, (usize, f64, f64)> = HashMap::new();
for (&key, &value) in group_keys.iter().zip(data_values.iter()) {
let entry = group_stats.entry(key).or_insert((0, 0.0, 0.0));
let (count, mean, m2) = *entry;
let new_count = count + 1;
let delta = value - mean;
let new_mean = mean + delta / new_count as f64;
let delta2 = value - new_mean;
let new_m2 = m2 + delta * delta2;
*entry = (new_count, new_mean, new_m2);
}
// Compute final result from (count, mean, m2)
// ...
}

Testing Strategy

Unit Tests (CPU Fallback)

Always write unit tests for CPU fallback implementations:

#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty_input() {
let result = my_operation_cpu(&[], &Config::default()).unwrap();
assert_eq!(result.len(), 0);
}
#[test]
fn test_single_value() {
let result = my_operation_cpu(&[42.0], &Config::default()).unwrap();
assert_eq!(result.len(), 1);
}
#[test]
fn test_correctness() {
let input = vec![1.0, 2.0, 3.0];
let result = my_operation_cpu(&input, &Config::default()).unwrap();
// Assert specific values
assert_eq!(result, vec![/* expected */]);
}
#[test]
fn test_edge_cases() {
// NaN, infinity, very large/small values
let input = vec![f64::NAN, f64::INFINITY, 0.0, -0.0];
let result = my_operation_cpu(&input, &Config::default());
// Assert proper handling
}
}

Integration Tests (GPU + Fallback)

#[test]
fn test_cpu_gpu_equivalence() {
if !capability::is_gpu_available() {
println!("Skipping equivalence test - GPU not available");
return;
}
let input = generate_test_data();
// Execute on CPU
let cpu_result = cpu_fallback::my_operation_cpu(&input, &config).unwrap();
// Execute on GPU
let gpu_result = gpu_executor.execute(&input, &config).unwrap();
// Results should be identical (or within numerical tolerance)
assert_eq!(cpu_result.len(), gpu_result.len());
for (cpu_val, gpu_val) in cpu_result.iter().zip(gpu_result.iter()) {
assert!((cpu_val - gpu_val).abs() < 1e-10);
}
}

Debugging Tips

Check Capability Detection

// Print full capability report
println!("{}", heliosdb_gpu::capability::capability_report());
// Check specific capability
if !heliosdb_gpu::supports_window_functions() {
println!("Window functions will use CPU fallback");
}
// Force re-detection (useful during development)
heliosdb_gpu::refresh_capabilities();

Add Logging

use tracing::{info, warn};
pub fn my_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
if !capability::supports_my_operation() {
warn!("GPU not available, using CPU fallback for my_operation");
return cpu_fallback::my_operation_cpu(data);
}
info!("Using GPU acceleration for my_operation");
// GPU implementation...
}

Benchmarking

use std::time::Instant;
// Compare CPU vs GPU performance
let start = Instant::now();
let cpu_result = cpu_fallback::my_operation_cpu(&data, &config)?;
let cpu_time = start.elapsed();
let start = Instant::now();
let gpu_result = gpu_executor.execute(&data, &config)?;
let gpu_time = start.elapsed();
println!("CPU: {:?}, GPU: {:?}, Speedup: {:.2}x",
cpu_time, gpu_time, cpu_time.as_secs_f64() / gpu_time.as_secs_f64());

Best Practices

1. Always Implement CPU Fallback First

// Good: CPU fallback ensures correctness
pub fn new_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
if !capability::supports_new_operation() {
return cpu_fallback::new_operation_cpu(data);
}
// GPU implementation (can be added later)
}
// Bad: No fallback, operation will panic
pub fn new_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
// GPU implementation only
todo!("Not implemented")
}

2. Use Descriptive Error Messages

// Good: Clear explanation
if !capability::supports_correlation() {
return Err(Error::OperationNotSupported(
"Correlation requires GPU acceleration (CPU fallback not yet implemented). \
Please enable GPU support or use simpler statistical functions.".into()
));
}
// Bad: Generic error
if !capability::supports_correlation() {
return Err(Error::OperationNotSupported("Not supported".into()));
}

3. Test Both Paths

#[test]
fn test_with_cpu_fallback() {
// Should pass even without GPU
let result = execute_operation(&data).unwrap();
assert_correct(result);
}
#[test]
fn test_with_gpu() {
if !capability::is_gpu_available() {
return; // Skip if GPU not available
}
let result = execute_operation(&data).unwrap();
assert_correct(result);
}

4. Document Fallback Behavior

/// Execute my GPU-accelerated operation
///
/// # Performance
/// - With GPU: 10-50x speedup over CPU
/// - Without GPU: Falls back to CPU implementation (correct but slower)
///
/// # Arguments
/// * `data` - Input data
///
/// # Returns
/// Processed results (always correct, regardless of GPU availability)
pub fn my_operation(&self, data: &[f64]) -> Result<Vec<f64>> {
// Implementation...
}

Common Pitfalls

Pitfall 1: Forgetting Capability Check

// Wrong: No capability check
pub fn operation(&self, data: &[f64]) -> Result<Vec<f64>> {
// Assumes GPU always available
launch_gpu_kernel(data)?;
Ok(results)
}
// Correct: Always check capability
pub fn operation(&self, data: &[f64]) -> Result<Vec<f64>> {
if !capability::supports_operation() {
return cpu_fallback::operation_cpu(data);
}
launch_gpu_kernel(data)?;
Ok(results)
}

Pitfall 2: Inconsistent Results

// Wrong: CPU and GPU return different results
fn cpu_version() -> f64 { 1.0 / 3.0 } // 0.333...
fn gpu_version() -> f64 { 0.333 } // Truncated
// Correct: Both return numerically equivalent results
fn cpu_version() -> f64 { 1.0 / 3.0 }
fn gpu_version() -> f64 { 1.0 / 3.0 } // Same calculation

Pitfall 3: Ignoring Edge Cases

// Wrong: Doesn't handle empty input
pub fn operation_cpu(data: &[f64]) -> Result<Vec<f64>> {
let mut result = Vec::new();
let first = data[0]; // Panic if data is empty!
// ...
}
// Correct: Handle edge cases
pub fn operation_cpu(data: &[f64]) -> Result<Vec<f64>> {
if data.is_empty() {
return Ok(Vec::new());
}
let first = data[0];
// ...
}

Example: Complete Implementation

Here’s a complete example implementing a new GPU operation with fallback:

heliosdb-gpu/src/my_new_feature.rs
use heliosdb_common::{Result, HeliosError as Error};
use std::sync::Arc;
use crate::device::GpuDevice;
use crate::memory_v2::{GpuBuffer, AdvancedGpuAllocator};
use crate::capability;
use crate::cpu_fallback;
/// My new GPU-accelerated feature
pub struct GpuMyNewFeature {
device: Arc<GpuDevice>,
allocator: Arc<AdvancedGpuAllocator>,
}
impl GpuMyNewFeature {
pub fn new(
device: Arc<GpuDevice>,
allocator: Arc<AdvancedGpuAllocator>,
) -> Result<Self> {
Ok(GpuMyNewFeature { device, allocator })
}
/// Execute my feature
///
/// # Performance
/// - GPU: 20x faster than CPU
/// - CPU fallback: Always correct
///
/// # Arguments
/// * `input` - Input data
///
/// # Returns
/// Processed results
pub fn execute(&self, input: &[f64]) -> Result<Vec<f64>> {
// Check GPU capability
if !capability::supports_my_new_feature() {
tracing::info!("Using CPU fallback for my_new_feature");
return cpu_fallback::my_new_feature_cpu(input);
}
tracing::info!("Using GPU acceleration for my_new_feature");
// Allocate GPU memory
let input_buf = self.allocator.allocate(
input.len() * std::mem::size_of::<f64>(),
crate::memory_v2::DeviceId(0)
)?;
// Copy to GPU
input_buf.copy_from_host(input)?;
// TODO: Launch CUDA kernel
// - Kernel: my_new_feature.cu::process_kernel
// - Grid: (input.len() + 255) / 256 blocks
// - Block: 256 threads
todo!("GPU kernel implementation pending")
}
}
// File: heliosdb-gpu/src/cpu_fallback.rs
/// CPU implementation of my_new_feature
pub fn my_new_feature_cpu(input: &[f64]) -> Result<Vec<f64>> {
let mut result = Vec::with_capacity(input.len());
for &value in input.iter() {
// CPU algorithm
let processed = value * 2.0; // Example
result.push(processed);
}
Ok(result)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_my_new_feature_cpu() {
let input = vec![1.0, 2.0, 3.0];
let result = my_new_feature_cpu(&input).unwrap();
assert_eq!(result, vec![2.0, 4.0, 6.0]);
}
}

Summary Checklist

When implementing a new GPU operation:

  • Add capability flag to capability.rs
  • Implement CPU fallback in cpu_fallback.rs
  • Write unit tests for CPU fallback
  • Implement GPU operation with fallback check
  • Add operation to module exports in lib.rs
  • Write integration tests (CPU and GPU paths)
  • Document fallback behavior
  • Add logging for fallback usage
  • Verify edge cases handled
  • Ensure CPU/GPU numerical equivalence

Guide Version: 1.0 Last Updated: 2025-11-24 Questions? See /home/claude/HeliosDB/docs/implementation/GPU_FALLBACK_IMPLEMENTATION_REPORT.md