HeliosDB Advanced ML Integration Guide

Version: 1.0 Last Updated: 2025-11-30 Status: Complete

Quick Start

-- Train ML model in database
SELECT * FROM ml_train('anomaly_detection',
  SELECT temperature, humidity, pressure FROM sensor_data WHERE labeled = true
);

-- Run inference on data
SELECT id, temperature, ml_predict('anomaly_detection', temperature, humidity) as anomaly_score
FROM sensor_data
WHERE anomaly_score > 0.8;

1. In-Database ML Training

Training Models with SQL

-- Training command format
SELECT ml_train(
  model_name,
  training_query,
  model_type,
  parameters
);

-- Example: Time series forecasting
SELECT ml_train(
  'sales_forecast',
  'SELECT day, revenue, promotion_running FROM daily_sales WHERE day < NOW() - INTERVAL 7',
  'time_series',
  jsonb_build_object(
    'algorithm', 'ARIMA',
    'p', 1, 'd', 1, 'q', 1,
    'forecast_horizon', 30
  )
);

Supported ML Algorithms

Algorithm	Type	Use Case
XGBoost	Classification/Regression	General purpose ML
LightGBM	Classification/Regression	Fast gradient boosting
Random Forest	Classification/Regression	Robust ensemble
K-Means	Clustering	Customer segmentation
Isolation Forest	Anomaly Detection	Outlier detection
ARIMA	Time Series	Forecasting
LSTM	Time Series	Sequence prediction
Gaussian Mixture	Clustering	Probabilistic clustering

Model Configuration

SELECT ml_train(
  'customer_churn_model',
  'SELECT * FROM customers WHERE active = true',
  'xgboost',
  jsonb_build_object(
    'objective', 'binary:logistic',
    'max_depth', 6,
    'learning_rate', 0.1,
    'n_estimators', 100,
    'validation_split', 0.2
  )
);

2. Federated Learning

Multi-Organization Training

-- Worker node trains locally
SELECT ml_federated_train(
  federation_id='sales_consortium',
  model_name='revenue_predictor',
  local_data='SELECT amount, date FROM local_orders',
  update_frequency='1 day'
);

-- Aggregator node combines updates
SELECT ml_federated_aggregate(
  federation_id='sales_consortium',
  aggregation_method='fedavg',
  quality_threshold=0.95
);

3. Model Registry & Versioning

Model Management

-- Register model
SELECT ml_register_model(
  model_name='fraud_detector',
  model_version='1.0.0',
  accuracy=0.98,
  production_ready=true
);

-- List models
SELECT * FROM ml_model_registry
WHERE production_ready = true
ORDER BY created_date DESC;

-- Compare versions
SELECT ml_compare_models('fraud_detector', '1.0.0', '1.1.0');

4. AutoML & Auto-Tuning

Automatic Model Selection

-- Let AutoML find best model
SELECT ml_automl(
  problem_type='classification',
  training_data='SELECT * FROM dataset',
  target_variable='fraud_flag',
  time_limit=3600,  -- 1 hour search
  metric='auc'
) as best_model;

-- AutoML tunes database for you
SELECT ml_auto_tune_database(
  optimization_metric='query_latency',
  target_p99_ms=100
);

5. Inference & Predictions

Making Predictions

-- Single record prediction
SELECT ml_predict('fraud_detector',
  transaction_amount => 150.00,
  merchant_risk_score => 0.8,
  customer_age => 35
) as fraud_probability;

-- Batch predictions
SELECT
  id,
  transaction_data,
  ml_predict_batch('fraud_detector', transaction_data) as fraud_score
FROM transactions
WHERE created_date >= NOW() - INTERVAL '1 day';

6. Feature Engineering

Automatic Feature Creation

-- Auto-generate features
SELECT ml_auto_feature_engineer(
  table_name='customers',
  target='churn_flag',
  feature_types=['numeric', 'categorical', 'temporal', 'relational']
);

-- Custom features
SELECT
  customer_id,
  customer_lifetime_value / NULLIF(days_as_customer, 0) as ltv_daily_rate,
  ROW_NUMBER() OVER (PARTITION BY region ORDER BY purchase_amount DESC) as rank_in_region
FROM customers;

7. Model Monitoring & Evaluation

Track Model Performance

-- Model evaluation metrics
SELECT * FROM ml_model_metrics('fraud_detector')
WHERE evaluation_date >= NOW() - INTERVAL '30 days'
ORDER BY evaluation_date DESC;

-- Detect model drift
SELECT ml_check_model_drift(
  model_name='fraud_detector',
  monitoring_window='7 days',
  drift_threshold=0.95
) as is_drift_detected;

8. Privacy-Preserving ML

Federated Learning with Privacy

-- Differential privacy on training
SELECT ml_train_with_privacy(
  model_name='sensitive_model',
  training_data=training_query,
  privacy_budget=0.1,  -- Epsilon value
  noise_mechanism='laplace'
);

-- Homomorphic encryption for inference
SELECT ml_predict_encrypted(
  model_name='fraud_detector',
  encrypted_features=encrypted_data,
  output_encryption=true
);

9. Model Explainability

Understand Predictions

-- Feature importance
SELECT feature, importance_score
FROM ml_explain_feature_importance('fraud_detector')
ORDER BY importance_score DESC
LIMIT 10;

-- SHAP values for predictions
SELECT
  feature,
  shap_value,
  contribution_to_prediction
FROM ml_explain_prediction(
  model='fraud_detector',
  input_data=transaction_features
)
ORDER BY ABS(shap_value) DESC;

10. Integration Examples

Real-Time Scoring Pipeline

-- Create materialized view with predictions
CREATE MATERIALIZED VIEW customer_risk_scores AS
SELECT
  c.customer_id,
  c.customer_name,
  ml_predict('fraud_detector',
    c.avg_transaction_amount,
    c.account_age_days,
    c.failed_login_attempts
  ) as fraud_risk_score,
  CASE
    WHEN ml_predict(...) > 0.8 THEN 'HIGH'
    WHEN ml_predict(...) > 0.5 THEN 'MEDIUM'
    ELSE 'LOW'
  END as risk_level
FROM customers c;

-- Refresh daily
REFRESH MATERIALIZED VIEW customer_risk_scores;

Anomaly Detection Pipeline

-- Insert predictions with models
INSERT INTO sensor_anomalies
SELECT
  id,
  timestamp,
  ml_predict('anomaly_model', temperature, humidity, pressure) as anomaly_score,
  CURRENT_TIMESTAMP as detected_at
FROM sensor_readings
WHERE ml_predict('anomaly_model', temperature, humidity, pressure) > 0.85;

Best Practices

Train on representative data - Ensure training set matches production data
Monitor for drift - Check model performance continuously
Version models - Track all model versions and metrics
Use validation sets - Prevent overfitting with proper validation
Document assumptions - Record feature engineering decisions
Plan for retraining - Update models regularly as patterns change

Summary

Advanced ML in HeliosDB enables:

SQL-native ML model training
Federated learning across organizations
AutoML for automatic model selection
Privacy-preserving techniques
Real-time inference at scale

Related Documentation: