Skip to content

HeliosDB High Availability Operational Procedures

HeliosDB High Availability Operational Procedures

Version: 1.0 | Last Updated: 2025-11-30


Cluster Setup

-- Bootstrap a three-node HA cluster named 'production-cluster' using
-- synchronous replication.
-- NOTE(review): quorum => 2 presumably means 2 of the 3 listed nodes must
-- agree/acknowledge — confirm against the HeliosDB reference.
SELECT create_ha_cluster(
    cluster_name     => 'production-cluster',
    nodes            => ARRAY['node1:5432', 'node2:5432', 'node3:5432'],
    replication_mode => 'synchronous',
    quorum           => 2
);
-- Inspect the resulting cluster state (ad-hoc check, so all columns are shown)
SELECT *
FROM cluster_status;

Monitoring

-- Per-node health snapshot: status, replication lag in bytes, uptime in
-- seconds, and CPU / memory usage percentages
SELECT
    node_name,
    status,
    lag_bytes,
    uptime_seconds,
    cpu_usage_pct,
    memory_usage_pct
FROM cluster_health;
-- Define the node_down alert: notify 'ops-team' at HIGH severity whenever
-- any node reports a status other than 'HEALTHY'
CREATE ALERT node_down AS
    WHEN ANY NODE status != 'HEALTHY'
    THEN NOTIFY 'ops-team' SEVERITY HIGH;

Failover Procedures

Automatic Failover

-- Turn on automatic failover for the cluster.
-- NOTE(review): failover_timeout = 30 carries no unit here — presumably
-- seconds; confirm before relying on it.
-- NOTE(review): failover_min_replicas = 1 presumably is the minimum number of
-- replicas required for a failover to proceed — confirm.
ALTER CLUSTER SET (
    auto_failover         = true,
    failover_timeout      = 30,
    failover_min_replicas = 1
);

Manual Failover

-- Start an operator-driven switchover with node2 as the target
SELECT manual_failover_to('node2');
-- Watch the switchover while it runs
SELECT *
FROM failover_progress;

Scaling

-- Grow the cluster: register node4 by its host:port address
SELECT add_cluster_node('node4:5432');
-- Shrink the cluster: drop node3 from membership.
-- NOTE(review): removal takes a bare node name while addition takes
-- host:port — confirm both forms are what the functions expect.
SELECT remove_cluster_node('node3');
-- Rebalance the cluster after the membership change
SELECT rebalance_cluster();

Recovery

-- Rebuild node3 after it has failed
SELECT rebuild_node('node3');
-- Reset the replica on node2.
-- NOTE(review): what "reset" discards or re-syncs is not shown here — check
-- the function's documentation before running it on a live replica.
SELECT reset_replica('node2');

Best Practices

  1. Use synchronous replication for critical data
  2. Monitor node health continuously
  3. Test failover monthly (e.g. with manual_failover_to)
  4. Keep replicas in sync; watch lag_bytes in cluster_health
  5. Set up automated alerts

Related Documentation: