HeliosDB High Availability Operational Procedures

Version: 1.0

Cluster Setup

-- Create an HA cluster named 'production-cluster' from three node addresses,
-- with replication_mode set to 'synchronous'.
-- NOTE(review): the node entries look like 'host:port' — confirm the expected
-- address format against the create_ha_cluster reference.
-- NOTE(review): quorum => 2 is presumably the number of nodes that must agree
-- (a majority of the 3 listed) — confirm before changing the node count.
SELECT create_ha_cluster(
  cluster_name => 'production-cluster',
  nodes => ARRAY['node1:5432', 'node2:5432', 'node3:5432'],
  replication_mode => 'synchronous',
  quorum => 2
);

-- Verify cluster status: return every column and row of cluster_status.
-- NOTE(review): the columns of cluster_status are not shown in this document;
-- check its reference for which values indicate a correctly formed cluster.
SELECT * FROM cluster_status;

Monitoring

-- Health check: read the name, status, lag, uptime, and CPU/memory usage
-- columns from cluster_health.
-- NOTE(review): units are inferred from the column names only
-- (lag_bytes = bytes, uptime_seconds = seconds, *_pct = percent) — confirm.
-- NOTE(review): lag_bytes is presumably replication lag behind the primary —
-- confirm against the cluster_health reference.
SELECT
  node_name,
  status,
  lag_bytes,
  uptime_seconds,
  cpu_usage_pct,
  memory_usage_pct
FROM cluster_health;

-- Alert setup: define an alert named node_down that notifies 'ops-team' with
-- severity HIGH when any node's status is not 'HEALTHY'.
-- The condition is status != 'HEALTHY', so it matches every non-healthy
-- status, not only a node that is fully down.
CREATE ALERT node_down AS
  WHEN ANY NODE status != 'HEALTHY'
  THEN NOTIFY 'ops-team' SEVERITY HIGH;

Failover Procedures

Automatic Failover

-- Enable automatic failover by setting three cluster options in one statement.
-- NOTE(review): failover_timeout = 30 carries no unit here — presumably
-- seconds; confirm before tuning.
-- NOTE(review): failover_min_replicas = 1 is presumably the minimum number of
-- replicas that must be available for a failover to proceed — confirm.
ALTER CLUSTER SET (
  auto_failover = true,
  failover_timeout = 30,
  failover_min_replicas = 1
);

Manual Failover

-- Initiate a manual failover (switchover), naming 'node2' as the target.
-- The node is referenced here by bare name ('node2'), whereas cluster creation
-- lists it as 'node2:5432'.
-- NOTE(review): confirm which identifier form manual_failover_to expects.
SELECT manual_failover_to('node2');

-- Monitor progress: return every column and row of failover_progress.
-- NOTE(review): presumably re-run until the failover is reported complete —
-- confirm which column/value indicates completion.
SELECT * FROM failover_progress;

Scaling

-- Add node: pass the new node's address ('node4:5432') to add_cluster_node.
SELECT add_cluster_node('node4:5432');

-- Remove node: pass the node to drop ('node3') to remove_cluster_node.
-- The add call above uses an address with a port, while this call uses a bare
-- name.
-- NOTE(review): confirm the identifier format each function expects, and
-- whether removing a node requires adjusting the cluster quorum first.
SELECT remove_cluster_node('node3');

-- Rebalance: call rebalance_cluster with no arguments.
-- NOTE(review): presumably run after membership changes to redistribute load
-- or data — confirm what is moved and whether it is safe under production
-- traffic.
SELECT rebalance_cluster();

Recovery

-- Rebuild failed node: call rebuild_node for 'node3'.
-- NOTE(review): presumably discards the node's local state and re-copies it
-- from a healthy node — confirm before running, as this may be destructive.
-- 'node3' is also the node removed in the Scaling section; confirm whether
-- rebuild_node works on a removed node or only on a current cluster member.
SELECT rebuild_node('node3');

-- Reset replica: call reset_replica for 'node2'.
-- NOTE(review): 'node2' is the manual failover target earlier in this
-- document; if it has since become the primary, confirm that reset_replica is
-- still the intended call.
SELECT reset_replica('node2');

Best Practices

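- Use synchronous replication for critical data.
- Monitor node health continuously (see the cluster_health query under Monitoring).
- Test failover monthly.
- Keep replicas in sync; watch lag_bytes in cluster_health for replicas falling behind.
- Set up automated alerts (see the node_down alert under Monitoring).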

Related Documentation: