Production Hardening Developer Guide
Quick Reference: How to use HeliosDB production hardening features
Table of Contents
- Quick Start
- Circuit Breaker
- Resource Management
- Configuration
- Logging
- Health Checks
- Graceful Shutdown
- Best Practices
Quick Start
Add Dependencies
[dependencies]heliosdb-common = { path = "../heliosdb-common" }tokio = { version = "1.40", features = ["full"] }tracing = "0.1"Basic Setup
use heliosdb_common::{ production_hardening::*, circuit_breaker::*, resource_management::*, health_endpoints::*, graceful_shutdown::*,};use std::sync::Arc;
#[tokio::main]async fn main() -> Result<(), Box<dyn std::error::Error>> { // 1. Initialize production config let config = ProductionConfig::default(); let hardening = Arc::new(ProductionHardeningManager::new(config));
// 2. Set up circuit breaker let breaker = Arc::new(CircuitBreaker::new(CircuitBreakerConfig::default()));
// 3. Initialize resource manager let resources = Arc::new(ResourceManager::new(ResourceManagementConfig::default()));
// 4. Set up health checks let health = Arc::new(HealthCheckSystem::new(HealthCheckConfig::default())); health.mark_started().await;
// 5. Configure graceful shutdown let shutdown = Arc::new(GracefulShutdownManager::new(GracefulShutdownConfig::default()));
// Your application logic here
 Ok(())}

Circuit Breaker
Basic Usage
use heliosdb_common::circuit_breaker::*;
// Create circuit breakerlet breaker = CircuitBreaker::new(CircuitBreakerConfig { failure_threshold: 0.5, window_duration: Duration::from_secs(60), open_duration: Duration::from_secs(30), max_retry_attempts: 3, initial_backoff: Duration::from_millis(100), max_backoff: Duration::from_secs(30), backoff_multiplier: 2.0,});
// Execute with circuit breakerlet result = breaker.call(|| { database.execute_query(query)}).await;
match result { Ok(data) => println!("Success: {:?}", data), Err(CircuitBreakerError::Open) => println!("Circuit is open, fast-failing"), Err(e) => println!("Error: {}", e),}

With Automatic Retry
// Execute with retry on failurelet result = breaker.call_with_retry(|| { external_api.call()}).await?;

Check Circuit State
// Check current statelet state = breaker.get_state().await;match state { CircuitState::Closed => println!("Normal operation"), CircuitState::Open => println!("Circuit is open"), CircuitState::HalfOpen => println!("Testing recovery"),}
// Get metricslet metrics = breaker.get_metrics().await;println!("Failure rate: {:.1}%", metrics.failure_rate * 100.0);println!("Failures: {}, Successes: {}", metrics.failures, metrics.successes);

Reset Circuit
// Manually reset circuit (use with caution)breaker.reset().await;

Resource Management
Connection Pooling
use heliosdb_common::resource_management::*;
// Create resource managerlet manager = ResourceManager::new(ResourceManagementConfig { connection_pool_size: 100, connection_idle_timeout: Duration::from_secs(300), connection_max_lifetime: Duration::from_secs(3600), ..Default::default()});
// Acquire connectionlet conn = manager.acquire_connection().await?;// Connection automatically returned to pool when dropped
// Get pool statisticslet metrics = manager.get_metrics().await;println!("Active: {}, Idle: {}, Utilization: {:.1}%", metrics.active_connections, metrics.idle_connections, metrics.connection_pool_utilization * 100.0);Memory Management
// Allocate memory with trackinglet memory = manager.allocate_memory(1024 * 1024).await?; // 1 MB// Memory automatically freed when guard dropped
// Check memory usagelet metrics = manager.get_metrics().await;println!("Memory: {} / {} bytes ({:.1}%)", metrics.memory_used_bytes, metrics.memory_limit_bytes, metrics.memory_usage_percent);Resource Tracking
// Track any resourcelet file_handle = std::fs::File::open("data.db")?;let tracked = manager.track_resource(file_handle, "database_file").await;
// Use resourcetracked.get().read_to_string(&mut buffer)?;
// Resource automatically cleaned up when dropped

Background Cleanup
// Run cleanup loop in backgroundlet manager_clone = manager.clone();tokio::spawn(async move { manager_clone.run_cleanup_loop().await;});Configuration
Basic Configuration
use heliosdb_common::config_management::*;use std::path::PathBuf;
// Define your configuration#[derive(Debug, Clone, Serialize, Deserialize)]struct AppConfig { pub database_url: String, pub port: u16, pub max_connections: usize,}
// Implement Configurable traitimpl Configurable for AppConfig { fn validate(&self) -> Result<(), ConfigError> { ConfigValidator::validate_port(self.port)?; ConfigValidator::validate_not_empty(&self.database_url, "database_url")?; Ok(()) }
fn from_file(path: &PathBuf) -> Result<Self, ConfigError> { let contents = std::fs::read_to_string(path)?; toml::from_str(&contents).map_err(|e| ConfigError::Invalid(e.to_string())) }
fn merge(&mut self, other: Self) { *self = other; }}
// Create config managerlet initial_config = AppConfig { database_url: "postgres://localhost".to_string(), port: 5432, max_connections: 100,};
let config_mgmt = ConfigManagementConfig { environment: Environment::Production, config_file: PathBuf::from("/etc/app/config.toml"), enable_hot_reload: true, ..Default::default()};
let manager = ConfigManager::new(initial_config, config_mgmt).await?;
// Get current configlet config = manager.get_config().await;
// Reload configmanager.reload_config().await?;Vault Integration
// Enable Vaultlet config_mgmt = ConfigManagementConfig { enable_vault: true, vault_address: Some("https://vault.example.com".to_string()), vault_token: Some(std::env::var("VAULT_TOKEN")?), ..Default::default()};
let manager = ConfigManager::new(initial_config, config_mgmt).await?;
// Get secret from Vaultlet db_password = manager.get_secret("secret/database/password").await?;Hot Reload
// Watch for config changeslet manager_clone = manager.clone();tokio::spawn(async move { manager_clone.watch_for_changes().await;});Environment Variables
use heliosdb_common::config_management::EnvConfigProvider;
// Get environment variablelet db_host = EnvConfigProvider::get("DATABASE_HOST") .unwrap_or_else(|| "localhost".to_string());
// Get with defaultlet port = EnvConfigProvider::get_or_default("PORT", "5432");
// Parse typed valuelet max_conn: usize = EnvConfigProvider::get_parsed("MAX_CONNECTIONS") .unwrap_or(100);Logging
Basic Setup
use heliosdb_common::structured_logging::*;
// Configure logginglet config = LoggingConfig { level: "info".to_string(), enable_pii_redaction: true, structured: true, ..Default::default()};
let logger = StructuredLogger::new(config)?;logger.init()?;
// Use standard tracing macrostracing::info!("Application started");tracing::warn!("High memory usage detected");tracing::error!("Database connection failed");PII Redaction
// PII is automatically redacted in logslet user_email = "john.doe@example.com";tracing::info!("User logged in: {}", user_email);// Output: "User logged in: [REDACTED]"
// Custom PII patternslet config = LoggingConfig { pii_patterns: vec![ r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b".to_string(), r"\b\d{3}-\d{2}-\d{4}\b".to_string(), // SSN r"my-custom-pattern".to_string(), ], ..Default::default()};Log with Context
use tracing::{info_span, instrument};
// Create span for contextlet span = info_span!("process_request", request_id = %request_id);let _enter = span.enter();
tracing::info!("Processing request"); // Includes request_id in output
// Or use instrument macro#[instrument(skip(database))]async fn process_user(user_id: u64, database: &Database) { tracing::info!("Processing user"); // Includes user_id automatically}Troubleshooting Utilities
use heliosdb_common::structured_logging::TroubleshootingUtils;
// Dump state for debugginglet state = get_current_state();let dump = TroubleshootingUtils::dump_state(&state, "component_name");println!("{}", dump);
// Create diagnostic snapshotlet snapshot = TroubleshootingUtils::create_diagnostic_snapshot();println!("Hostname: {}", snapshot.get("hostname").unwrap());println!("Memory: {} KB", snapshot.get("free_memory_kb").unwrap());
// Format error chainlet error = some_operation()?;let chain = TroubleshootingUtils::format_error_chain(&error);tracing::error!("Operation failed:\n{}", chain);Health Checks
Basic Setup
use heliosdb_common::health_endpoints::*;use std::sync::Arc;
// Create health check systemlet health = Arc::new(HealthCheckSystem::new(HealthCheckConfig { check_interval: Duration::from_secs(10), check_timeout: Duration::from_secs(5), ..Default::default()}));
// Register componentshealth.register_component(Arc::new(StorageHealthCheck::new("storage".to_string()))).await;health.register_component(Arc::new(DatabaseHealthCheck::new("database".to_string()))).await;
// Mark application as startedhealth.mark_started().await;
// Serve health endpointslet health_clone = health.clone();tokio::spawn(async move { serve_health_endpoints(health_clone, "0.0.0.0:8080".parse().unwrap()).await.unwrap();});Custom Health Checks
use heliosdb_common::health_endpoints::HealthCheckable;
// Implement custom health checkstruct MyServiceHealthCheck { service: Arc<MyService>,}
impl HealthCheckable for MyServiceHealthCheck { async fn check_health(&self) -> Result<(), String> { // Check if service is healthy if self.service.is_connected().await { Ok(()) } else { Err("Service not connected".to_string()) } }
fn component_name(&self) -> &str { "my_service" }}
// Register custom checkhealth.register_component(Arc::new(MyServiceHealthCheck { service: my_service.clone(),})).await;Check Health Programmatically
// Check livenesslet liveness = health.liveness().await;if liveness.status == HealthStatus::Healthy { println!("Application is alive");}
// Check readinesslet readiness = health.readiness().await;if readiness.status == HealthStatus::Healthy { println!("Ready to serve traffic");}
// Check startuplet startup = health.startup().await;if startup.status == HealthStatus::Healthy { println!("Startup complete");}Background Health Monitoring
// Run health checks in backgroundlet health_clone = health.clone();tokio::spawn(async move { health_clone.run_health_checks().await;});Graceful Shutdown
Basic Setup
use heliosdb_common::graceful_shutdown::*;
// Create shutdown managerlet shutdown = Arc::new(GracefulShutdownManager::new(GracefulShutdownConfig { shutdown_timeout: Duration::from_secs(30), connection_drain_timeout: Duration::from_secs(15), inflight_timeout: Duration::from_secs(10), cleanup_timeout: Duration::from_secs(5),}));
// Register shutdown handlersshutdown.register_handler(Arc::new( ConnectionPoolShutdownHandler::new("main_pool".to_string()))).await;
// Listen for signalslet shutdown_clone = shutdown.clone();tokio::spawn(async move { shutdown_clone.listen_for_signals().await.unwrap();});Custom Shutdown Handlers
use heliosdb_common::graceful_shutdown::ShutdownHandler;
struct MyServiceShutdownHandler { service: Arc<MyService>,}
impl ShutdownHandler for MyServiceShutdownHandler { async fn on_shutdown_start(&self) -> Result<(), String> { tracing::info!("Starting service shutdown"); Ok(()) }
async fn drain_connections(&self) -> Result<(), String> { self.service.stop_accepting_connections().await; Ok(()) }
async fn complete_inflight(&self) -> Result<(), String> { self.service.wait_for_requests().await; Ok(()) }
async fn cleanup_resources(&self) -> Result<(), String> { self.service.cleanup().await; Ok(()) }}
// Register handlershutdown.register_handler(Arc::new(MyServiceShutdownHandler { service: my_service.clone(),})).await;Check Shutdown State
// Check if shutting downif shutdown.is_shutting_down().await { // Stop accepting new work return Ok(());}
// Get current phaselet phase = shutdown.get_shutdown_phase().await;match phase { ShutdownPhase::Running => println!("Normal operation"), ShutdownPhase::DrainConnections => println!("Draining connections"), ShutdownPhase::CompleteInFlight => println!("Completing requests"), ShutdownPhase::CleanupResources => println!("Cleaning up"), ShutdownPhase::Shutdown => println!("Shutdown complete"),}
// Get metricslet metrics = shutdown.get_metrics().await;println!("Connections drained: {}", metrics.connections_drained);println!("In-flight completed: {}", metrics.inflight_completed);Manual Shutdown
// Trigger shutdown programmaticallyshutdown.execute_shutdown().await?;Best Practices
1. Circuit Breaker
✅ DO:
- Use circuit breakers for external service calls
- Configure failure thresholds based on SLAs
- Monitor circuit breaker state
- Use retry with exponential backoff
- Reset circuit only when safe
❌ DON’T:
- Use for internal function calls
- Set failure threshold too low (causes flapping)
- Ignore circuit open state
- Retry without backoff
2. Resource Management
✅ DO:
- Set appropriate pool sizes for workload
- Configure memory limits per environment
- Enable leak detection in production
- Monitor resource metrics
- Use RAII patterns (guards)
❌ DON’T:
- Create unlimited resources
- Ignore resource warnings
- Disable cleanup
- Mix manual and automatic management
3. Configuration
✅ DO:
- Validate configuration on load
- Use environment-specific configs
- Store secrets in Vault
- Enable hot reload for non-critical configs
- Use environment variables for overrides
❌ DON’T:
- Store secrets in config files
- Skip validation
- Reload critical configs without restart
- Use same config for all environments
4. Logging
✅ DO:
- Enable PII redaction in production
- Use structured logging (JSON)
- Set appropriate log levels (INFO for prod)
- Add contextual information
- Rotate logs regularly
❌ DON’T:
- Log sensitive data directly
- Use DEBUG level in production
- Log excessively in hot paths
- Ignore log sampling for high volume
5. Health Checks
✅ DO:
- Implement all three probes (liveness, readiness, startup)
- Set appropriate timeouts
- Check critical dependencies
- Return quickly (< 5s)
- Use background health monitoring
❌ DON’T:
- Perform expensive operations in health checks
- Skip dependency checks
- Ignore health check failures
- Set timeouts too short
6. Graceful Shutdown
✅ DO:
- Implement all shutdown phases
- Set realistic timeouts
- Handle signals properly
- Wait for in-flight requests
- Clean up all resources
❌ DON’T:
- Force immediate shutdown
- Ignore in-flight requests
- Skip resource cleanup
- Set timeouts too short
Common Patterns
Pattern 1: Service with Full Hardening
struct MyService { circuit_breaker: Arc<CircuitBreaker>, resource_manager: Arc<ResourceManager>, health_system: Arc<HealthCheckSystem>, shutdown_manager: Arc<GracefulShutdownManager>,}
impl MyService { async fn new() -> Result<Self, Box<dyn std::error::Error>> { let circuit_breaker = Arc::new(CircuitBreaker::new(CircuitBreakerConfig::default())); let resource_manager = Arc::new(ResourceManager::new(ResourceManagementConfig::default())); let health_system = Arc::new(HealthCheckSystem::new(HealthCheckConfig::default())); let shutdown_manager = Arc::new(GracefulShutdownManager::new(GracefulShutdownConfig::default()));
health_system.mark_started().await;
Ok(Self { circuit_breaker, resource_manager, health_system, shutdown_manager, }) }
async fn call_external_api(&self, request: Request) -> Result<Response, Error> { // Check if shutting down if self.shutdown_manager.is_shutting_down().await { return Err(Error::ShuttingDown); }
// Use circuit breaker with retry let result = self.circuit_breaker.call_with_retry(|| { external_api.call(request.clone()) }).await?;
Ok(result) }}Pattern 2: Database with Resource Management
struct Database { resource_manager: Arc<ResourceManager>,}
impl Database { async fn execute_query(&self, query: &str) -> Result<Vec<Row>, Error> { // Acquire connection from pool let conn = self.resource_manager.acquire_connection().await?;
// Allocate memory for results let mem = self.resource_manager.allocate_memory(estimated_size).await?;
// Execute query let rows = self.execute_with_connection(&conn, query).await?;
Ok(rows) // Connection and memory automatically released }}Pattern 3: HTTP Server with Health Checks
async fn start_server( health_system: Arc<HealthCheckSystem>, shutdown_manager: Arc<GracefulShutdownManager>,) -> Result<(), Box<dyn std::error::Error>> { // Serve health endpoints tokio::spawn(async move { serve_health_endpoints(health_system, "0.0.0.0:8080".parse().unwrap()).await });
// Serve main application let server = warp::serve(routes) .bind_with_graceful_shutdown( ([0, 0, 0, 0], 3000), async move { shutdown_manager.wait_for_shutdown_signal().await; } );
server.await;
Ok(())}Troubleshooting
Circuit Breaker Stuck Open
Problem: Circuit breaker won’t close
Solution:
// Check metricslet metrics = breaker.get_metrics().await;println!("Failures: {}, Successes: {}", metrics.failures, metrics.successes);
// Wait for open_duration to elapse// Circuit will transition to half-open and test
// If necessary, reset manually (use with caution)breaker.reset().await;Resource Leaks Detected
Problem: Leak detection reports leaks
Solution:
// Get resource metricslet metrics = manager.get_metrics().await;println!("Leaks detected: {}", metrics.resource_leaks_detected);
// Enable detailed trackinglet config = ResourceManagementConfig { enable_leak_detection: true, leak_detection_interval: Duration::from_secs(60), ..Default::default()};
// Check specific resources// Ensure all resources are properly droppedHealth Checks Failing
Problem: Health checks timeout or fail
Solution:
// Check timeout configurationlet config = HealthCheckConfig { check_timeout: Duration::from_secs(10), // Increase if needed ..Default::default()};
// Check individual componentlet readiness = health_system.readiness().await;for (name, check) in &readiness.checks { if check.status != HealthStatus::Healthy { println!("Component {} failed: {:?}", name, check.message); }}Graceful Shutdown Timeout
Problem: Shutdown takes too long or times out
Solution:
// Increase timeoutslet config = GracefulShutdownConfig { shutdown_timeout: Duration::from_secs(60), // Increase total connection_drain_timeout: Duration::from_secs(30), ..Default::default()};
// Check what's taking timelet metrics = shutdown_manager.get_metrics().await;println!("Phase: {}", metrics.phase);println!("Active connections: {}", metrics.connections_active);Additional Resources
- Production Hardening Completion Report
- Circuit Breaker Source
- Resource Management Source
- Health Endpoints Source
- Graceful Shutdown Source
- Integration Example
Guide Version: 1.0.0 · Last Updated: November 24, 2025 · Maintainers: HeliosDB Core Team