Production Hardening Developer Guide
Quick Reference: How to use HeliosDB production hardening features
Table of Contents
- Quick Start
- Circuit Breaker
- Resource Management
- Configuration
- Logging
- Health Checks
- Graceful Shutdown
- Best Practices
Quick Start
Add Dependencies
[dependencies]heliosdb-common = { path = "../heliosdb-common" }tokio = { version = "1.40", features = ["full"] }tracing = "0.1"Basic Setup
use heliosdb_common::{ production_hardening::*, circuit_breaker::*, resource_management::*, health_endpoints::*, graceful_shutdown::*,};use std::sync::Arc;
#[tokio::main]async fn main() -> Result<(), Box<dyn std::error::Error>> { // 1. Initialize production config let config = ProductionConfig::default(); let hardening = Arc::new(ProductionHardeningManager::new(config));
// 2. Set up circuit breaker let breaker = Arc::new(CircuitBreaker::new(CircuitBreakerConfig::default()));
// 3. Initialize resource manager let resources = Arc::new(ResourceManager::new(ResourceManagementConfig::default()));
// 4. Set up health checks let health = Arc::new(HealthCheckSystem::new(HealthCheckConfig::default())); health.mark_started().await;
// 5. Configure graceful shutdown let shutdown = Arc::new(GracefulShutdownManager::new(GracefulShutdownConfig::default()));
// Your application logic here
 Ok(())}

Circuit Breaker
Basic Usage
use heliosdb_common::circuit_breaker::*;
// Create circuit breakerlet breaker = CircuitBreaker::new(CircuitBreakerConfig { failure_threshold: 0.5, window_duration: Duration::from_secs(60), open_duration: Duration::from_secs(30), max_retry_attempts: 3, initial_backoff: Duration::from_millis(100), max_backoff: Duration::from_secs(30), backoff_multiplier: 2.0,});
// Execute with circuit breakerlet result = breaker.call(|| { database.execute_query(query)}).await;
match result { Ok(data) => println!("Success: {:?}", data), Err(CircuitBreakerError::Open) => println!("Circuit is open, fast-failing"), Err(e) => println!("Error: {}", e),}

With Automatic Retry
// Execute with retry on failurelet result = breaker.call_with_retry(|| { external_api.call()}).await?;

Check Circuit State
// Check current statelet state = breaker.get_state().await;match state { CircuitState::Closed => println!("Normal operation"), CircuitState::Open => println!("Circuit is open"), CircuitState::HalfOpen => println!("Testing recovery"),}
// Get metricslet metrics = breaker.get_metrics().await;println!("Failure rate: {:.1}%", metrics.failure_rate * 100.0);println!("Failures: {}, Successes: {}", metrics.failures, metrics.successes);

Reset Circuit
// Manually reset circuit (use with caution)breaker.reset().await;

Resource Management
Connection Pooling
use heliosdb_common::resource_management::*;
// Create resource managerlet manager = ResourceManager::new(ResourceManagementConfig { connection_pool_size: 100, connection_idle_timeout: Duration::from_secs(300), connection_max_lifetime: Duration::from_secs(3600), ..Default::default()});
// Acquire connectionlet conn = manager.acquire_connection().await?;// Connection automatically returned to pool when dropped
// Get pool statisticslet metrics = manager.get_metrics().await;println!("Active: {}, Idle: {}, Utilization: {:.1}%", metrics.active_connections, metrics.idle_connections, metrics.connection_pool_utilization * 100.0);Memory Management
// Allocate memory with trackinglet memory = manager.allocate_memory(1024 * 1024).await?; // 1 MB// Memory automatically freed when guard dropped
// Check memory usagelet metrics = manager.get_metrics().await;println!("Memory: {} / {} bytes ({:.1}%)", metrics.memory_used_bytes, metrics.memory_limit_bytes, metrics.memory_usage_percent);Resource Tracking
// Track any resourcelet file_handle = std::fs::File::open("data.db")?;let tracked = manager.track_resource(file_handle, "database_file").await;
// Use resourcetracked.get().read_to_string(&mut buffer)?;
// Resource automatically cleaned up when dropped

Background Cleanup
// Run cleanup loop in backgroundlet manager_clone = manager.clone();tokio::spawn(async move { manager_clone.run_cleanup_loop().await;});Configuration
Basic Configuration
use heliosdb_common::config_management::*;use std::path::PathBuf;
// Define your configuration#[derive(Debug, Clone, Serialize, Deserialize)]struct AppConfig { pub database_url: String, pub port: u16, pub max_connections: usize,}
// Implement Configurable traitimpl Configurable for AppConfig { fn validate(&self) -> Result<(), ConfigError> { ConfigValidator::validate_port(self.port)?; ConfigValidator::validate_not_empty(&self.database_url, "database_url")?; Ok(()) }
fn from_file(path: &PathBuf) -> Result<Self, ConfigError> { let contents = std::fs::read_to_string(path)?; toml::from_str(&contents).map_err(|e| ConfigError::Invalid(e.to_string())) }
fn merge(&mut self, other: Self) { *self = other; }}
// Create config managerlet initial_config = AppConfig { database_url: "postgres://localhost".to_string(), port: 5432, max_connections: 100,};
let config_mgmt = ConfigManagementConfig { environment: Environment::Production, config_file: PathBuf::from("/etc/app/config.toml"), enable_hot_reload: true, ..Default::default()};
let manager = ConfigManager::new(initial_config, config_mgmt).await?;
// Get current configlet config = manager.get_config().await;
// Reload configmanager.reload_config().await?;Vault Integration
// Enable Vaultlet config_mgmt = ConfigManagementConfig { enable_vault: true, vault_address: Some("https://vault.example.com".to_string()), vault_token: Some(std::env::var("VAULT_TOKEN")?), ..Default::default()};
let manager = ConfigManager::new(initial_config, config_mgmt).await?;
// Get secret from Vaultlet db_password = manager.get_secret("secret/database/password").await?;Hot Reload
// Watch for config changeslet manager_clone = manager.clone();tokio::spawn(async move { manager_clone.watch_for_changes().await;});Environment Variables
use heliosdb_common::config_management::EnvConfigProvider;
// Get environment variablelet db_host = EnvConfigProvider::get("DATABASE_HOST") .unwrap_or_else(|| "localhost".to_string());
// Get with defaultlet port = EnvConfigProvider::get_or_default("PORT", "5432");
// Parse typed valuelet max_conn: usize = EnvConfigProvider::get_parsed("MAX_CONNECTIONS") .unwrap_or(100);Logging
Basic Setup
use heliosdb_common::structured_logging::*;
// Configure logginglet config = LoggingConfig { level: "info".to_string(), enable_pii_redaction: true, structured: true, ..Default::default()};
let logger = StructuredLogger::new(config)?;logger.init()?;
// Use standard tracing macrostracing::info!("Application started");tracing::warn!("High memory usage detected");tracing::error!("Database connection failed");PII Redaction
// PII is automatically redacted in logslet user_email = "john.doe@example.com";tracing::info!("User logged in: {}", user_email);// Output: "User logged in: [REDACTED]"
// Custom PII patternslet config = LoggingConfig { pii_patterns: vec![ r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b".to_string(), r"\b\d{3}-\d{2}-\d{4}\b".to_string(), // SSN r"my-custom-pattern".to_string(), ], ..Default::default()};Log with Context
use tracing::{info_span, instrument};
// Create span for contextlet span = info_span!("process_request", request_id = %request_id);let _enter = span.enter();
tracing::info!("Processing request"); // Includes request_id in output
// Or use instrument macro#[instrument(skip(database))]async fn process_user(user_id: u64, database: &Database) { tracing::info!("Processing user"); // Includes user_id automatically}Troubleshooting Utilities
use heliosdb_common::structured_logging::TroubleshootingUtils;
// Dump state for debugginglet state = get_current_state();let dump = TroubleshootingUtils::dump_state(&state, "component_name");println!("{}", dump);
// Create diagnostic snapshotlet snapshot = TroubleshootingUtils::create_diagnostic_snapshot();println!("Hostname: {}", snapshot.get("hostname").unwrap());println!("Memory: {} KB", snapshot.get("free_memory_kb").unwrap());
// Format error chainlet error = some_operation()?;let chain = TroubleshootingUtils::format_error_chain(&error);tracing::error!("Operation failed:\n{}", chain);Health Checks
Basic Setup
use heliosdb_common::health_endpoints::*;use std::sync::Arc;
// Create health check systemlet health = Arc::new(HealthCheckSystem::new(HealthCheckConfig { check_interval: Duration::from_secs(10), check_timeout: Duration::from_secs(5), ..Default::default()}));
// Register componentshealth.register_component(Arc::new(StorageHealthCheck::new("storage".to_string()))).await;health.register_component(Arc::new(DatabaseHealthCheck::new("database".to_string()))).await;
// Mark application as startedhealth.mark_started().await;
// Serve health endpointslet health_clone = health.clone();tokio::spawn(async move { serve_health_endpoints(health_clone, "0.0.0.0:8080".parse().unwrap()).await.unwrap();});Custom Health Checks
use heliosdb_common::health_endpoints::HealthCheckable;
// Implement custom health checkstruct MyServiceHealthCheck { service: Arc<MyService>,}
impl HealthCheckable for MyServiceHealthCheck { async fn check_health(&self) -> Result<(), String> { // Check if service is healthy if self.service.is_connected().await { Ok(()) } else { Err("Service not connected".to_string()) } }
fn component_name(&self) -> &str { "my_service" }}
// Register custom checkhealth.register_component(Arc::new(MyServiceHealthCheck { service: my_service.clone(),})).await;Check Health Programmatically
// Check livenesslet liveness = health.liveness().await;if liveness.status == HealthStatus::Healthy { println!("Application is alive");}
// Check readinesslet readiness = health.readiness().await;if readiness.status == HealthStatus::Healthy { println!("Ready to serve traffic");}
// Check startuplet startup = health.startup().await;if startup.status == HealthStatus::Healthy { println!("Startup complete");}Background Health Monitoring
// Run health checks in backgroundlet health_clone = health.clone();tokio::spawn(async move { health_clone.run_health_checks().await;});Graceful Shutdown
Basic Setup
use heliosdb_common::graceful_shutdown::*;
// Create shutdown managerlet shutdown = Arc::new(GracefulShutdownManager::new(GracefulShutdownConfig { shutdown_timeout: Duration::from_secs(30), connection_drain_timeout: Duration::from_secs(15), inflight_timeout: Duration::from_secs(10), cleanup_timeout: Duration::from_secs(5),}));
// Register shutdown handlersshutdown.register_handler(Arc::new( ConnectionPoolShutdownHandler::new("main_pool".to_string()))).await;
// Listen for signalslet shutdown_clone = shutdown.clone();tokio::spawn(async move { shutdown_clone.listen_for_signals().await.unwrap();});Custom Shutdown Handlers
use heliosdb_common::graceful_shutdown::ShutdownHandler;
struct MyServiceShutdownHandler { service: Arc<MyService>,}
impl ShutdownHandler for MyServiceShutdownHandler { async fn on_shutdown_start(&self) -> Result<(), String> { tracing::info!("Starting service shutdown"); Ok(()) }
async fn drain_connections(&self) -> Result<(), String> { self.service.stop_accepting_connections().await; Ok(()) }
async fn complete_inflight(&self) -> Result<(), String> { self.service.wait_for_requests().await; Ok(()) }
async fn cleanup_resources(&self) -> Result<(), String> { self.service.cleanup().await; Ok(()) }}
// Register handlershutdown.register_handler(Arc::new(MyServiceShutdownHandler { service: my_service.clone(),})).await;Check Shutdown State
// Check if shutting downif shutdown.is_shutting_down().await { // Stop accepting new work return Ok(());}
// Get current phaselet phase = shutdown.get_shutdown_phase().await;match phase { ShutdownPhase::Running => println!("Normal operation"), ShutdownPhase::DrainConnections => println!("Draining connections"), ShutdownPhase::CompleteInFlight => println!("Completing requests"), ShutdownPhase::CleanupResources => println!("Cleaning up"), ShutdownPhase::Shutdown => println!("Shutdown complete"),}
// Get metricslet metrics = shutdown.get_metrics().await;println!("Connections drained: {}", metrics.connections_drained);println!("In-flight completed: {}", metrics.inflight_completed);Manual Shutdown
// Trigger shutdown programmaticallyshutdown.execute_shutdown().await?;Best Practices
1. Circuit Breaker
✅ DO:
- Use circuit breakers for external service calls
- Configure failure thresholds based on SLAs
- Monitor circuit breaker state
- Use retry with exponential backoff
- Reset circuit only when safe
❌ DON’T:
- Use for internal function calls
- Set failure threshold too low (causes flapping)
- Ignore circuit open state
- Retry without backoff
2. Resource Management
✅ DO:
- Set appropriate pool sizes for workload
- Configure memory limits per environment
- Enable leak detection in production
- Monitor resource metrics
- Use RAII patterns (guards)
❌ DON’T:
- Create unlimited resources
- Ignore resource warnings
- Disable cleanup
- Mix manual and automatic management
3. Configuration
✅ DO:
- Validate configuration on load
- Use environment-specific configs
- Store secrets in Vault
- Enable hot reload for non-critical configs
- Use environment variables for overrides
❌ DON’T:
- Store secrets in config files
- Skip validation
- Reload critical configs without restart
- Use same config for all environments
4. Logging
✅ DO:
- Enable PII redaction in production
- Use structured logging (JSON)
- Set appropriate log levels (INFO for prod)
- Add contextual information
- Rotate logs regularly
❌ DON’T:
- Log sensitive data directly
- Use DEBUG level in production
- Log excessively in hot paths
- Ignore log sampling for high volume
5. Health Checks
✅ DO:
- Implement all three probes (liveness, readiness, startup)
- Set appropriate timeouts
- Check critical dependencies
- Return quickly (< 5s)
- Use background health monitoring
❌ DON’T:
- Perform expensive operations in health checks
- Skip dependency checks
- Ignore health check failures
- Set timeouts too short
6. Graceful Shutdown
✅ DO:
- Implement all shutdown phases
- Set realistic timeouts
- Handle signals properly
- Wait for in-flight requests
- Clean up all resources
❌ DON’T:
- Force immediate shutdown
- Ignore in-flight requests
- Skip resource cleanup
- Set timeouts too short
Common Patterns
Pattern 1: Service with Full Hardening
struct MyService { circuit_breaker: Arc<CircuitBreaker>, resource_manager: Arc<ResourceManager>, health_system: Arc<HealthCheckSystem>, shutdown_manager: Arc<GracefulShutdownManager>,}
impl MyService { async fn new() -> Result<Self, Box<dyn std::error::Error>> { let circuit_breaker = Arc::new(CircuitBreaker::new(CircuitBreakerConfig::default())); let resource_manager = Arc::new(ResourceManager::new(ResourceManagementConfig::default())); let health_system = Arc::new(HealthCheckSystem::new(HealthCheckConfig::default())); let shutdown_manager = Arc::new(GracefulShutdownManager::new(GracefulShutdownConfig::default()));
health_system.mark_started().await;
Ok(Self { circuit_breaker, resource_manager, health_system, shutdown_manager, }) }
async fn call_external_api(&self, request: Request) -> Result<Response, Error> { // Check if shutting down if self.shutdown_manager.is_shutting_down().await { return Err(Error::ShuttingDown); }
// Use circuit breaker with retry let result = self.circuit_breaker.call_with_retry(|| { external_api.call(request.clone()) }).await?;
Ok(result) }}Pattern 2: Database with Resource Management
struct Database { resource_manager: Arc<ResourceManager>,}
impl Database { async fn execute_query(&self, query: &str) -> Result<Vec<Row>, Error> { // Acquire connection from pool let conn = self.resource_manager.acquire_connection().await?;
// Allocate memory for results let mem = self.resource_manager.allocate_memory(estimated_size).await?;
// Execute query let rows = self.execute_with_connection(&conn, query).await?;
Ok(rows) // Connection and memory automatically released }}Pattern 3: HTTP Server with Health Checks
async fn start_server( health_system: Arc<HealthCheckSystem>, shutdown_manager: Arc<GracefulShutdownManager>,) -> Result<(), Box<dyn std::error::Error>> { // Serve health endpoints tokio::spawn(async move { serve_health_endpoints(health_system, "0.0.0.0:8080".parse().unwrap()).await });
// Serve main application let server = warp::serve(routes) .bind_with_graceful_shutdown( ([0, 0, 0, 0], 3000), async move { shutdown_manager.wait_for_shutdown_signal().await; } );
server.await;
Ok(())}Troubleshooting
Circuit Breaker Stuck Open
Problem: Circuit breaker won’t close
Solution:
// Check metricslet metrics = breaker.get_metrics().await;println!("Failures: {}, Successes: {}", metrics.failures, metrics.successes);
// Wait for open_duration to elapse// Circuit will transition to half-open and test
// If necessary, reset manually (use with caution)breaker.reset().await;Resource Leaks Detected
Problem: Leak detection reports leaks
Solution:
// Get resource metricslet metrics = manager.get_metrics().await;println!("Leaks detected: {}", metrics.resource_leaks_detected);
// Enable detailed trackinglet config = ResourceManagementConfig { enable_leak_detection: true, leak_detection_interval: Duration::from_secs(60), ..Default::default()};
// Check specific resources// Ensure all resources are properly droppedHealth Checks Failing
Problem: Health checks timeout or fail
Solution:
// Check timeout configurationlet config = HealthCheckConfig { check_timeout: Duration::from_secs(10), // Increase if needed ..Default::default()};
// Check individual componentlet readiness = health_system.readiness().await;for (name, check) in &readiness.checks { if check.status != HealthStatus::Healthy { println!("Component {} failed: {:?}", name, check.message); }}Graceful Shutdown Timeout
Problem: Shutdown takes too long or times out
Solution:
// Increase timeoutslet config = GracefulShutdownConfig { shutdown_timeout: Duration::from_secs(60), // Increase total connection_drain_timeout: Duration::from_secs(30), ..Default::default()};
// Check what's taking timelet metrics = shutdown_manager.get_metrics().await;println!("Phase: {}", metrics.phase);println!("Active connections: {}", metrics.connections_active);Additional Resources
- Production Hardening Completion Report
- Circuit Breaker Source
- Resource Management Source
- Health Endpoints Source
- Graceful Shutdown Source
- Integration Example
Guide Version: 1.0.0 · Last Updated: November 24, 2025 · Maintainers: HeliosDB Core Team