Skip to content

HeliosDB Advanced ML Integration Guide

HeliosDB Advanced ML Integration Guide

Version: 1.0 | Last Updated: 2025-11-30 | Status: Complete


Quick Start

-- Train ML model in database.
-- The training query is passed as a string literal; a bare SELECT is not a
-- valid function argument.
SELECT *
FROM ml_train(
    'anomaly_detection',
    'SELECT temperature, humidity, pressure FROM sensor_data WHERE labeled = true'
);

-- Run inference on data.
-- A column alias cannot be referenced in the WHERE clause of the same SELECT,
-- so the score is computed in a subquery and filtered in the outer query.
-- The model is trained on temperature, humidity and pressure, so all three
-- features are passed at inference time.
SELECT
    scored.id,
    scored.temperature,
    scored.anomaly_score
FROM (
    SELECT
        id,
        temperature,
        ml_predict('anomaly_detection', temperature, humidity, pressure) AS anomaly_score
    FROM sensor_data
) AS scored
WHERE scored.anomaly_score > 0.8;

1. In-Database ML Training

Training Models with SQL

-- Training command format.
-- The four arguments below are placeholders, not literal values; the example
-- that follows shows a model name, the training query as SQL text, a model
-- type string, and a JSON object of algorithm parameters.
SELECT ml_train(
    model_name,
    training_query,
    model_type,
    parameters
);

-- Example: Time series forecasting.
-- Single quotes inside the training query string are doubled (''), and the
-- interval is written as a quoted literal with an explicit unit.
-- NOTE(review): the original read "INTERVAL 7" with no unit; days is assumed
-- because the table is daily_sales -- confirm.
SELECT ml_train(
    'sales_forecast',
    'SELECT day, revenue, promotion_running FROM daily_sales WHERE day < NOW() - INTERVAL ''7 days''',
    'time_series',
    jsonb_build_object(
        'algorithm', 'ARIMA',
        'p', 1, 'd', 1, 'q', 1,
        'forecast_horizon', 30
    )
);

Supported ML Algorithms

| Algorithm | Type | Use Case |
|---|---|---|
| XGBoost | Classification/Regression | General purpose ML |
| LightGBM | Classification/Regression | Fast gradient boosting |
| Random Forest | Classification/Regression | Robust ensemble |
| K-Means | Clustering | Customer segmentation |
| Isolation Forest | Anomaly Detection | Outlier detection |
| ARIMA | Time Series | Forecasting |
| LSTM | Time Series | Sequence prediction |
| Gaussian Mixture | Clustering | Probabilistic clustering |

Model Configuration

-- Train a model named 'customer_churn_model' with model type 'xgboost';
-- the hyperparameters are supplied as a JSON object.
SELECT
    ml_train(
        'customer_churn_model',
        'SELECT * FROM customers WHERE active = true',
        'xgboost',
        jsonb_build_object(
            'objective',        'binary:logistic',
            'max_depth',        6,
            'learning_rate',    0.1,
            'n_estimators',     100,
            'validation_split', 0.2
        )
    );

2. Federated Learning

Multi-Organization Training

-- Worker node trains locally.
-- Named arguments use the name => value notation; name = value would be
-- parsed as an equality comparison rather than an argument binding.
SELECT ml_federated_train(
    federation_id    => 'sales_consortium',
    model_name       => 'revenue_predictor',
    local_data       => 'SELECT amount, date FROM local_orders',
    update_frequency => '1 day'
);

-- Aggregator node combines updates.
SELECT ml_federated_aggregate(
    federation_id      => 'sales_consortium',
    aggregation_method => 'fedavg',
    quality_threshold  => 0.95
);

3. Model Registry & Versioning

Model Management

-- Register model.
-- Named arguments use the name => value notation; name = value would be
-- parsed as an equality comparison rather than an argument binding.
SELECT ml_register_model(
    model_name       => 'fraud_detector',
    model_version    => '1.0.0',
    accuracy         => 0.98,
    production_ready => true
);

-- List models: production-ready only, newest first.
SELECT *
FROM ml_model_registry
WHERE production_ready = true
ORDER BY created_date DESC;

-- Compare versions.
SELECT ml_compare_models('fraud_detector', '1.0.0', '1.1.0');

4. AutoML & Auto-Tuning

Automatic Model Selection

-- Let AutoML find the best model.
-- Named arguments use the name => value notation; name = value would be
-- parsed as an equality comparison rather than an argument binding.
SELECT ml_automl(
    problem_type    => 'classification',
    training_data   => 'SELECT * FROM dataset',
    target_variable => 'fraud_flag',
    time_limit      => 3600, -- 1 hour search
    metric          => 'auc'
) AS best_model;

-- AutoML tunes the database for you.
SELECT ml_auto_tune_database(
    optimization_metric => 'query_latency',
    target_p99_ms       => 100
);

5. Inference & Predictions

Making Predictions

-- Score a single record, passing each feature as a named argument.
SELECT
    ml_predict(
        'fraud_detector',
        transaction_amount  => 150.00,
        merchant_risk_score => 0.8,
        customer_age        => 35
    ) AS fraud_probability;

-- Score every transaction created within the last day.
SELECT
    id,
    transaction_data,
    ml_predict_batch('fraud_detector', transaction_data) AS fraud_score
FROM transactions
WHERE created_date >= NOW() - INTERVAL '1 day';

6. Feature Engineering

Automatic Feature Creation

-- Auto-generate features.
-- Named arguments use the name => value notation, and the list of feature
-- types is written with the ARRAY[...] constructor; a bare [...] is not a
-- valid array literal.
SELECT ml_auto_feature_engineer(
    table_name    => 'customers',
    target        => 'churn_flag',
    feature_types => ARRAY['numeric', 'categorical', 'temporal', 'relational']
);

-- Custom features.
SELECT
    customer_id,
    -- NULLIF turns a zero days_as_customer into NULL, so the division yields
    -- NULL instead of a division-by-zero error.
    customer_lifetime_value / NULLIF(days_as_customer, 0) AS ltv_daily_rate,
    ROW_NUMBER() OVER (PARTITION BY region ORDER BY purchase_amount DESC) AS rank_in_region
FROM customers;

7. Model Monitoring & Evaluation

Track Model Performance

-- Model evaluation metrics for the last 30 days, newest first.
SELECT *
FROM ml_model_metrics('fraud_detector')
WHERE evaluation_date >= NOW() - INTERVAL '30 days'
ORDER BY evaluation_date DESC;

-- Detect model drift.
-- Named arguments use the name => value notation; name = value would be
-- parsed as an equality comparison rather than an argument binding.
SELECT ml_check_model_drift(
    model_name        => 'fraud_detector',
    monitoring_window => '7 days',
    drift_threshold   => 0.95
) AS is_drift_detected;

8. Privacy-Preserving ML

Federated Learning with Privacy

-- Differential privacy on training.
-- Named arguments use the name => value notation; name = value would be
-- parsed as an equality comparison rather than an argument binding.
-- NOTE(review): training_query is not defined in this example; presumably a
-- placeholder for the training query -- confirm and substitute.
SELECT ml_train_with_privacy(
    model_name      => 'sensitive_model',
    training_data   => training_query,
    privacy_budget  => 0.1, -- Epsilon value
    noise_mechanism => 'laplace'
);

-- Homomorphic encryption for inference.
-- NOTE(review): encrypted_data is not defined in this example; presumably a
-- placeholder for the encrypted feature payload -- confirm and substitute.
SELECT ml_predict_encrypted(
    model_name         => 'fraud_detector',
    encrypted_features => encrypted_data,
    output_encryption  => true
);

9. Model Explainability

Understand Predictions

-- Feature importance: the ten highest-scoring features.
SELECT
    feature,
    importance_score
FROM ml_explain_feature_importance('fraud_detector')
ORDER BY importance_score DESC
LIMIT 10;

-- SHAP values for predictions, largest absolute SHAP value first.
-- Named arguments use the name => value notation; name = value would be
-- parsed as an equality comparison rather than an argument binding.
-- NOTE(review): transaction_features is not defined in this example;
-- presumably a placeholder for the input record -- confirm and substitute.
SELECT
    feature,
    shap_value,
    contribution_to_prediction
FROM ml_explain_prediction(
    model      => 'fraud_detector',
    input_data => transaction_features
)
ORDER BY ABS(shap_value) DESC;

10. Integration Examples

Real-Time Scoring Pipeline

-- Create materialized view with predictions.
-- The fraud score is computed once in the "scored" CTE and then reused for
-- both the score column and the risk bucket, so the ml_predict call does not
-- have to be repeated inside the CASE expression.
CREATE MATERIALIZED VIEW customer_risk_scores AS
WITH scored AS (
    SELECT
        c.customer_id,
        c.customer_name,
        ml_predict(
            'fraud_detector',
            c.avg_transaction_amount,
            c.account_age_days,
            c.failed_login_attempts
        ) AS fraud_risk_score
    FROM customers AS c
)
SELECT
    customer_id,
    customer_name,
    fraud_risk_score,
    -- Buckets: > 0.8 is HIGH, > 0.5 is MEDIUM, everything else (including a
    -- NULL score) falls through to LOW.
    CASE
        WHEN fraud_risk_score > 0.8 THEN 'HIGH'
        WHEN fraud_risk_score > 0.5 THEN 'MEDIUM'
        ELSE 'LOW'
    END AS risk_level
FROM scored;

-- Refresh daily.
REFRESH MATERIALIZED VIEW customer_risk_scores;

Anomaly Detection Pipeline

-- Insert readings whose anomaly score exceeds 0.85 into sensor_anomalies.
-- The scoring expression is written once in the "scored" subquery and reused
-- for both the inserted value and the filter, so the two cannot drift apart.
-- NOTE(review): the INSERT has no column list, so values are matched to the
-- columns of sensor_anomalies by position -- confirm the table's column order
-- and add an explicit column list.
INSERT INTO sensor_anomalies
SELECT
    scored.id,
    scored.timestamp,
    scored.anomaly_score,
    CURRENT_TIMESTAMP AS detected_at
FROM (
    SELECT
        id,
        timestamp,
        ml_predict('anomaly_model', temperature, humidity, pressure) AS anomaly_score
    FROM sensor_readings
) AS scored
WHERE scored.anomaly_score > 0.85;

Best Practices

  1. Train on representative data - Ensure training set matches production data
  2. Monitor for drift - Check model performance continuously
  3. Version models - Track all model versions and metrics
  4. Use validation sets - Prevent overfitting with proper validation
  5. Document assumptions - Record feature engineering decisions
  6. Plan for retraining - Update models regularly as patterns change

Summary

Advanced ML in HeliosDB enables:

  • SQL-native ML model training
  • Federated learning across organizations
  • AutoML for automatic model selection
  • Privacy-preserving techniques
  • Real-time inference at scale

Related Documentation: