
Production Hardening Developer Guide

Quick Reference: How to use HeliosDB production hardening features


Table of Contents

  1. Quick Start
  2. Circuit Breaker
  3. Resource Management
  4. Configuration
  5. Logging
  6. Health Checks
  7. Graceful Shutdown
  8. Best Practices

Quick Start

Add Dependencies

[dependencies]
heliosdb-common = { path = "../heliosdb-common" }
tokio = { version = "1.40", features = ["full"] }
tracing = "0.1"

Basic Setup

use heliosdb_common::{
production_hardening::*,
circuit_breaker::*,
resource_management::*,
health_endpoints::*,
graceful_shutdown::*,
};
use std::sync::Arc;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// 1. Initialize production config
let config = ProductionConfig::default();
let hardening = Arc::new(ProductionHardeningManager::new(config));
// 2. Set up circuit breaker
let breaker = Arc::new(CircuitBreaker::new(CircuitBreakerConfig::default()));
// 3. Initialize resource manager
let resources = Arc::new(ResourceManager::new(ResourceManagementConfig::default()));
// 4. Set up health checks
let health = Arc::new(HealthCheckSystem::new(HealthCheckConfig::default()));
health.mark_started().await;
// 5. Configure graceful shutdown
let shutdown = Arc::new(GracefulShutdownManager::new(GracefulShutdownConfig::default()));
// Your application logic here
Ok(())
}

Circuit Breaker

Basic Usage

use heliosdb_common::circuit_breaker::*;
// Create circuit breaker
let breaker = CircuitBreaker::new(CircuitBreakerConfig {
failure_threshold: 0.5,
window_duration: Duration::from_secs(60),
open_duration: Duration::from_secs(30),
max_retry_attempts: 3,
initial_backoff: Duration::from_millis(100),
max_backoff: Duration::from_secs(30),
backoff_multiplier: 2.0,
});
// Execute with circuit breaker
let result = breaker.call(|| {
database.execute_query(query)
}).await;
match result {
Ok(data) => println!("Success: {:?}", data),
Err(CircuitBreakerError::Open) => println!("Circuit is open, fast-failing"),
Err(e) => println!("Error: {}", e),
}

With Automatic Retry

// Execute with retry on failure
let result = breaker.call_with_retry(|| {
external_api.call()
}).await?;

Check Circuit State

// Check current state
let state = breaker.get_state().await;
match state {
CircuitState::Closed => println!("Normal operation"),
CircuitState::Open => println!("Circuit is open"),
CircuitState::HalfOpen => println!("Testing recovery"),
}
// Get metrics
let metrics = breaker.get_metrics().await;
println!("Failure rate: {:.1}%", metrics.failure_rate * 100.0);
println!("Failures: {}, Successes: {}", metrics.failures, metrics.successes);

Reset Circuit

// Manually reset circuit (use with caution)
breaker.reset().await;

Resource Management

Connection Pooling

use heliosdb_common::resource_management::*;
// Create resource manager
let manager = ResourceManager::new(ResourceManagementConfig {
connection_pool_size: 100,
connection_idle_timeout: Duration::from_secs(300),
connection_max_lifetime: Duration::from_secs(3600),
..Default::default()
});
// Acquire connection
let conn = manager.acquire_connection().await?;
// Connection automatically returned to pool when dropped
// Get pool statistics
let metrics = manager.get_metrics().await;
println!("Active: {}, Idle: {}, Utilization: {:.1}%",
metrics.active_connections,
metrics.idle_connections,
metrics.connection_pool_utilization * 100.0
);

Memory Management

// Allocate memory with tracking
let memory = manager.allocate_memory(1024 * 1024).await?; // 1 MB
// Memory automatically freed when guard dropped
// Check memory usage
let metrics = manager.get_metrics().await;
println!("Memory: {} / {} bytes ({:.1}%)",
metrics.memory_used_bytes,
metrics.memory_limit_bytes,
metrics.memory_usage_percent
);

Resource Tracking

// Track any resource
let file_handle = std::fs::File::open("data.db")?;
let tracked = manager.track_resource(file_handle, "database_file").await;
// Use resource
let mut buffer = String::new();
tracked.get().read_to_string(&mut buffer)?;
// Resource automatically cleaned up when dropped

Background Cleanup

// Run cleanup loop in background
let manager_clone = manager.clone();
tokio::spawn(async move {
manager_clone.run_cleanup_loop().await;
});

Configuration

Basic Configuration

use heliosdb_common::config_management::*;
use std::path::PathBuf;
// Define your configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AppConfig {
pub database_url: String,
pub port: u16,
pub max_connections: usize,
}
// Implement Configurable trait
impl Configurable for AppConfig {
fn validate(&self) -> Result<(), ConfigError> {
ConfigValidator::validate_port(self.port)?;
ConfigValidator::validate_not_empty(&self.database_url, "database_url")?;
Ok(())
}
fn from_file(path: &PathBuf) -> Result<Self, ConfigError> {
let contents = std::fs::read_to_string(path)?;
toml::from_str(&contents).map_err(|e| ConfigError::Invalid(e.to_string()))
}
fn merge(&mut self, other: Self) {
*self = other;
}
}
// Create config manager
let initial_config = AppConfig {
database_url: "postgres://localhost".to_string(),
port: 5432,
max_connections: 100,
};
let config_mgmt = ConfigManagementConfig {
environment: Environment::Production,
config_file: PathBuf::from("/etc/app/config.toml"),
enable_hot_reload: true,
..Default::default()
};
let manager = ConfigManager::new(initial_config, config_mgmt).await?;
// Get current config
let config = manager.get_config().await;
// Reload config
manager.reload_config().await?;

Vault Integration

// Enable Vault
let config_mgmt = ConfigManagementConfig {
enable_vault: true,
vault_address: Some("https://vault.example.com".to_string()),
vault_token: Some(std::env::var("VAULT_TOKEN")?),
..Default::default()
};
let manager = ConfigManager::new(initial_config, config_mgmt).await?;
// Get secret from Vault
let db_password = manager.get_secret("secret/database/password").await?;

Hot Reload

// Watch for config changes
let manager_clone = manager.clone();
tokio::spawn(async move {
manager_clone.watch_for_changes().await;
});

Environment Variables

use heliosdb_common::config_management::EnvConfigProvider;
// Get environment variable
let db_host = EnvConfigProvider::get("DATABASE_HOST")
.unwrap_or_else(|| "localhost".to_string());
// Get with default
let port = EnvConfigProvider::get_or_default("PORT", "5432");
// Parse typed value
let max_conn: usize = EnvConfigProvider::get_parsed("MAX_CONNECTIONS")
.unwrap_or(100);

Logging

Basic Setup

use heliosdb_common::structured_logging::*;
// Configure logging
let config = LoggingConfig {
level: "info".to_string(),
enable_pii_redaction: true,
structured: true,
..Default::default()
};
let logger = StructuredLogger::new(config)?;
logger.init()?;
// Use standard tracing macros
tracing::info!("Application started");
tracing::warn!("High memory usage detected");
tracing::error!("Database connection failed");

PII Redaction

// PII is automatically redacted in logs
let user_email = "john.doe@example.com";
tracing::info!("User logged in: {}", user_email);
// Output: "User logged in: [REDACTED]"
// Custom PII patterns
let config = LoggingConfig {
pii_patterns: vec![
r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b".to_string(),
r"\b\d{3}-\d{2}-\d{4}\b".to_string(), // SSN
r"my-custom-pattern".to_string(),
],
..Default::default()
};

Log with Context

use tracing::{info_span, instrument};
// Create span for context
let span = info_span!("process_request", request_id = %request_id);
let _enter = span.enter();
tracing::info!("Processing request"); // Includes request_id in output
// Or use instrument macro
#[instrument(skip(database))]
async fn process_user(user_id: u64, database: &Database) {
tracing::info!("Processing user"); // Includes user_id automatically
}

Troubleshooting Utilities

use heliosdb_common::structured_logging::TroubleshootingUtils;
// Dump state for debugging
let state = get_current_state();
let dump = TroubleshootingUtils::dump_state(&state, "component_name");
println!("{}", dump);
// Create diagnostic snapshot
let snapshot = TroubleshootingUtils::create_diagnostic_snapshot();
println!("Hostname: {}", snapshot.get("hostname").unwrap());
println!("Memory: {} KB", snapshot.get("free_memory_kb").unwrap());
// Format error chain
let error = some_operation()?;
let chain = TroubleshootingUtils::format_error_chain(&error);
tracing::error!("Operation failed:\n{}", chain);

Health Checks

Basic Setup

use heliosdb_common::health_endpoints::*;
use std::sync::Arc;
// Create health check system
let health = Arc::new(HealthCheckSystem::new(HealthCheckConfig {
check_interval: Duration::from_secs(10),
check_timeout: Duration::from_secs(5),
..Default::default()
}));
// Register components
health.register_component(Arc::new(StorageHealthCheck::new("storage".to_string()))).await;
health.register_component(Arc::new(DatabaseHealthCheck::new("database".to_string()))).await;
// Mark application as started
health.mark_started().await;
// Serve health endpoints
let health_clone = health.clone();
tokio::spawn(async move {
serve_health_endpoints(health_clone, "0.0.0.0:8080".parse().unwrap()).await.unwrap();
});

Custom Health Checks

use heliosdb_common::health_endpoints::HealthCheckable;
// Implement custom health check
// NOTE: async methods in a trait impl require the `#[async_trait]` attribute
// (async-trait crate) or Rust 1.75+ native async-in-trait support
struct MyServiceHealthCheck {
service: Arc<MyService>,
}
impl HealthCheckable for MyServiceHealthCheck {
async fn check_health(&self) -> Result<(), String> {
// Check if service is healthy
if self.service.is_connected().await {
Ok(())
} else {
Err("Service not connected".to_string())
}
}
fn component_name(&self) -> &str {
"my_service"
}
}
// Register custom check
health.register_component(Arc::new(MyServiceHealthCheck {
service: my_service.clone(),
})).await;

Check Health Programmatically

// Check liveness
let liveness = health.liveness().await;
if liveness.status == HealthStatus::Healthy {
println!("Application is alive");
}
// Check readiness
let readiness = health.readiness().await;
if readiness.status == HealthStatus::Healthy {
println!("Ready to serve traffic");
}
// Check startup
let startup = health.startup().await;
if startup.status == HealthStatus::Healthy {
println!("Startup complete");
}

Background Health Monitoring

// Run health checks in background
let health_clone = health.clone();
tokio::spawn(async move {
health_clone.run_health_checks().await;
});

Graceful Shutdown

Basic Setup

use heliosdb_common::graceful_shutdown::*;
// Create shutdown manager
let shutdown = Arc::new(GracefulShutdownManager::new(GracefulShutdownConfig {
shutdown_timeout: Duration::from_secs(30),
connection_drain_timeout: Duration::from_secs(15),
inflight_timeout: Duration::from_secs(10),
cleanup_timeout: Duration::from_secs(5),
}));
// Register shutdown handlers
shutdown.register_handler(Arc::new(
ConnectionPoolShutdownHandler::new("main_pool".to_string())
)).await;
// Listen for signals
let shutdown_clone = shutdown.clone();
tokio::spawn(async move {
shutdown_clone.listen_for_signals().await.unwrap();
});

Custom Shutdown Handlers

use heliosdb_common::graceful_shutdown::ShutdownHandler;
struct MyServiceShutdownHandler {
service: Arc<MyService>,
}
impl ShutdownHandler for MyServiceShutdownHandler {
async fn on_shutdown_start(&self) -> Result<(), String> {
tracing::info!("Starting service shutdown");
Ok(())
}
async fn drain_connections(&self) -> Result<(), String> {
self.service.stop_accepting_connections().await;
Ok(())
}
async fn complete_inflight(&self) -> Result<(), String> {
self.service.wait_for_requests().await;
Ok(())
}
async fn cleanup_resources(&self) -> Result<(), String> {
self.service.cleanup().await;
Ok(())
}
}
// Register handler
shutdown.register_handler(Arc::new(MyServiceShutdownHandler {
service: my_service.clone(),
})).await;

Check Shutdown State

// Check if shutting down
if shutdown.is_shutting_down().await {
// Stop accepting new work
return Ok(());
}
// Get current phase
let phase = shutdown.get_shutdown_phase().await;
match phase {
ShutdownPhase::Running => println!("Normal operation"),
ShutdownPhase::DrainConnections => println!("Draining connections"),
ShutdownPhase::CompleteInFlight => println!("Completing requests"),
ShutdownPhase::CleanupResources => println!("Cleaning up"),
ShutdownPhase::Shutdown => println!("Shutdown complete"),
}
// Get metrics
let metrics = shutdown.get_metrics().await;
println!("Connections drained: {}", metrics.connections_drained);
println!("In-flight completed: {}", metrics.inflight_completed);

Manual Shutdown

// Trigger shutdown programmatically
shutdown.execute_shutdown().await?;

Best Practices

1. Circuit Breaker

DO:

  • Use circuit breakers for external service calls
  • Configure failure thresholds based on SLAs
  • Monitor circuit breaker state
  • Use retry with exponential backoff
  • Reset circuit only when safe

DON’T:

  • Use for internal function calls
  • Set failure threshold too low (causes flapping)
  • Ignore circuit open state
  • Retry without backoff

2. Resource Management

DO:

  • Set appropriate pool sizes for workload
  • Configure memory limits per environment
  • Enable leak detection in production
  • Monitor resource metrics
  • Use RAII patterns (guards)

DON’T:

  • Create unlimited resources
  • Ignore resource warnings
  • Disable cleanup
  • Mix manual and automatic management

3. Configuration

DO:

  • Validate configuration on load
  • Use environment-specific configs
  • Store secrets in Vault
  • Enable hot reload for non-critical configs
  • Use environment variables for overrides

DON’T:

  • Store secrets in config files
  • Skip validation
  • Reload critical configs without restart
  • Use same config for all environments

4. Logging

DO:

  • Enable PII redaction in production
  • Use structured logging (JSON)
  • Set appropriate log levels (INFO for prod)
  • Add contextual information
  • Rotate logs regularly

DON’T:

  • Log sensitive data directly
  • Use DEBUG level in production
  • Log excessively in hot paths
  • Ignore log sampling for high volume

5. Health Checks

DO:

  • Implement all three probes (liveness, readiness, startup)
  • Set appropriate timeouts
  • Check critical dependencies
  • Return quickly (< 5s)
  • Use background health monitoring

DON’T:

  • Perform expensive operations in health checks
  • Skip dependency checks
  • Ignore health check failures
  • Set timeouts too short

6. Graceful Shutdown

DO:

  • Implement all shutdown phases
  • Set realistic timeouts
  • Handle signals properly
  • Wait for in-flight requests
  • Clean up all resources

DON’T:

  • Force immediate shutdown
  • Ignore in-flight requests
  • Skip resource cleanup
  • Set timeouts too short

Common Patterns

Pattern 1: Service with Full Hardening

struct MyService {
circuit_breaker: Arc<CircuitBreaker>,
resource_manager: Arc<ResourceManager>,
health_system: Arc<HealthCheckSystem>,
shutdown_manager: Arc<GracefulShutdownManager>,
}
impl MyService {
async fn new() -> Result<Self, Box<dyn std::error::Error>> {
let circuit_breaker = Arc::new(CircuitBreaker::new(CircuitBreakerConfig::default()));
let resource_manager = Arc::new(ResourceManager::new(ResourceManagementConfig::default()));
let health_system = Arc::new(HealthCheckSystem::new(HealthCheckConfig::default()));
let shutdown_manager = Arc::new(GracefulShutdownManager::new(GracefulShutdownConfig::default()));
health_system.mark_started().await;
Ok(Self {
circuit_breaker,
resource_manager,
health_system,
shutdown_manager,
})
}
async fn call_external_api(&self, request: Request) -> Result<Response, Error> {
// Check if shutting down
if self.shutdown_manager.is_shutting_down().await {
return Err(Error::ShuttingDown);
}
// Use circuit breaker with retry
let result = self.circuit_breaker.call_with_retry(|| {
external_api.call(request.clone())
}).await?;
Ok(result)
}
}

Pattern 2: Database with Resource Management

struct Database {
resource_manager: Arc<ResourceManager>,
}
impl Database {
async fn execute_query(&self, query: &str) -> Result<Vec<Row>, Error> {
// Acquire connection from pool
let conn = self.resource_manager.acquire_connection().await?;
// Allocate memory for results
let mem = self.resource_manager.allocate_memory(estimated_size).await?;
// Execute query
let rows = self.execute_with_connection(&conn, query).await?;
Ok(rows)
// Connection and memory automatically released
}
}

Pattern 3: HTTP Server with Health Checks

async fn start_server(
health_system: Arc<HealthCheckSystem>,
shutdown_manager: Arc<GracefulShutdownManager>,
) -> Result<(), Box<dyn std::error::Error>> {
// Serve health endpoints
tokio::spawn(async move {
serve_health_endpoints(health_system, "0.0.0.0:8080".parse().unwrap()).await
});
// Serve main application
let server = warp::serve(routes)
.bind_with_graceful_shutdown(
([0, 0, 0, 0], 3000),
async move {
shutdown_manager.wait_for_shutdown_signal().await;
}
);
server.await;
Ok(())
}

Troubleshooting

Circuit Breaker Stuck Open

Problem: Circuit breaker won’t close

Solution:

// Check metrics
let metrics = breaker.get_metrics().await;
println!("Failures: {}, Successes: {}", metrics.failures, metrics.successes);
// Wait for open_duration to elapse
// Circuit will transition to half-open and test
// If necessary, reset manually (use with caution)
breaker.reset().await;

Resource Leaks Detected

Problem: Leak detection reports leaks

Solution:

// Get resource metrics
let metrics = manager.get_metrics().await;
println!("Leaks detected: {}", metrics.resource_leaks_detected);
// Enable detailed tracking
let config = ResourceManagementConfig {
enable_leak_detection: true,
leak_detection_interval: Duration::from_secs(60),
..Default::default()
};
// Check specific resources
// Ensure all resources are properly dropped

Health Checks Failing

Problem: Health checks timeout or fail

Solution:

// Check timeout configuration
let config = HealthCheckConfig {
check_timeout: Duration::from_secs(10), // Increase if needed
..Default::default()
};
// Check individual component
let readiness = health_system.readiness().await;
for (name, check) in &readiness.checks {
if check.status != HealthStatus::Healthy {
println!("Component {} failed: {:?}", name, check.message);
}
}

Graceful Shutdown Timeout

Problem: Shutdown takes too long or times out

Solution:

// Increase timeouts
let config = GracefulShutdownConfig {
shutdown_timeout: Duration::from_secs(60), // Increase total
connection_drain_timeout: Duration::from_secs(30),
..Default::default()
};
// Check what's taking time
let metrics = shutdown_manager.get_metrics().await;
println!("Phase: {}", metrics.phase);
println!("Active connections: {}", metrics.connections_active);

Additional Resources


Guide Version: 1.0.0 · Last Updated: November 24, 2025 · Maintainers: HeliosDB Core Team