AutoML: Automating the Machine Learning Pipeline
Machine learning practitioners spend the majority of their time on repetitive tasks: trying different algorithms, tuning hyperparameters, engineering features, and assembling ensembles. AutoML automates these steps, often matching or exceeding expert-level performance in a fraction of the time.
This lesson covers the major AutoML frameworks, their strengths and weaknesses, and practical guidance on when (and when not) to use them.
What AutoML Automates
The AutoML Landscape
Auto-sklearn
Built on top of scikit-learn, Auto-sklearn uses Bayesian optimization (SMAC) to search over sklearn's algorithm and hyperparameter space. It includes meta-learning (using past experience to warm-start the search) and automatic ensemble construction.

Key features:
FLAML (Fast Lightweight AutoML)
Developed by Microsoft, FLAML is designed for speed. It uses a novel cost-frugal optimization approach that allocates more resources to promising configurations and less to poor ones.

Key features:
H2O AutoML
An enterprise-grade AutoML platform with a powerful leaderboard and stacked ensemble construction.

Key features:
AutoGluon (Amazon)
The most hands-off AutoML framework. Designed for practitioners who want maximum accuracy with minimal configuration.

Key features:
TPOT (Tree-based Pipeline Optimization Tool)
Uses genetic programming to evolve sklearn pipelines.

Key features:
Comparison Table
| Tool | Speed | Accuracy | Ease of Use | Best For |
|---|---|---|---|---|
| Auto-sklearn | Medium | High | Medium | Sklearn-compatible workflows |
| FLAML | Very Fast | Good | Easy | Quick experiments, resource constraints |
| H2O AutoML | Fast | High | Easy | Enterprise, large datasets |
| AutoGluon | Medium | Very High | Very Easy | Maximum accuracy, competitions |
| TPOT | Slow | Good | Medium | Explainable pipeline search |
| Google Cloud AutoML | N/A | High | Very Easy | No-code users, GCP ecosystem |
# === AutoML Comparison Framework ===
# We'll build a lightweight AutoML simulator that demonstrates
# the core concepts: algorithm selection, HPO, and ensembling

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, train_test_split
# NOTE(review): AdaBoostClassifier, SVC, and DecisionTreeClassifier are
# imported but not used in this cell — possibly intended for reader exercises.
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, ExtraTreesClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import time

# Fix the global NumPy seed so the run is reproducible.
np.random.seed(42)

# --- Create dataset ---
# Synthetic binary classification: 2000 rows, 20 features, of which 12 are
# informative and 4 are redundant linear combinations.
X, y = make_classification(
    n_samples=2000, n_features=20, n_informative=12,
    n_redundant=4, n_classes=2, random_state=42
)
# Hold out 20% of rows for the final test-set evaluation of the winner.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Define search space (like Auto-sklearn) ---
# Each entry maps an algorithm name to its estimator class plus a small,
# hand-picked grid of hyperparameter configurations to try.
search_space = {
    "LogisticRegression": {
        "model": LogisticRegression,
        "params": [
            # Sweep regularization strength C over four orders of magnitude.
            {"C": 0.01, "max_iter": 1000},
            {"C": 0.1, "max_iter": 1000},
            {"C": 1.0, "max_iter": 1000},
            {"C": 10.0, "max_iter": 1000},
        ],
    },
    "RandomForest": {
        "model": RandomForestClassifier,
        "params": [
            {"n_estimators": 50, "max_depth": 5, "random_state": 42},
            {"n_estimators": 100, "max_depth": 10, "random_state": 42},
            # max_depth=None lets trees grow until leaves are pure.
            {"n_estimators": 200, "max_depth": None, "random_state": 42},
        ],
    },
    "GradientBoosting": {
        "model": GradientBoostingClassifier,
        "params": [
            # More estimators paired with a lower learning rate.
            {"n_estimators": 50, "learning_rate": 0.1, "max_depth": 3, "random_state": 42},
            {"n_estimators": 100, "learning_rate": 0.05, "max_depth": 4, "random_state": 42},
            {"n_estimators": 200, "learning_rate": 0.01, "max_depth": 5, "random_state": 42},
        ],
    },
    "KNN": {
        "model": KNeighborsClassifier,
        "params": [
            {"n_neighbors": 3},
            {"n_neighbors": 7},
            {"n_neighbors": 15},
        ],
    },
    "ExtraTrees": {
        "model": ExtraTreesClassifier,
        "params": [
            {"n_estimators": 100, "max_depth": 10, "random_state": 42},
            {"n_estimators": 200, "max_depth": None, "random_state": 42},
        ],
    },
}

# --- Run AutoML search ---
# Exhaustively evaluate every (algorithm, config) pair with 5-fold CV
# and record score, spread, and wall-clock cost for the leaderboard.
print("=== AutoML Search ===")
print(f"Dataset: {X_train.shape[0]} train, {X_test.shape[0]} test, "
f"{X_train.shape[1]} features")
print(f"Search space: {sum(len(v['params']) for v in search_space.values())} "
f"configurations across {len(search_space)} algorithms\n")

results = []
total_start = time.time()

for algo_name, config in search_space.items():
    for i, params in enumerate(config["params"]):
        start = time.time()

        # Create pipeline with scaling
        # (scaling matters for LogisticRegression and KNN; harmless for trees).
        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("model", config["model"](**params)),
        ])

        # Cross-validation score
        # NOTE: cross_val_score fits clones, so `pipe` itself stays unfitted.
        cv_scores = cross_val_score(pipe, X_train, y_train,
                                    cv=5, scoring="accuracy")
        elapsed = time.time() - start

        results.append({
            "algorithm": algo_name,
            "config_id": i,          # index within this algorithm's grid
            "params": params,
            "cv_mean": cv_scores.mean(),
            "cv_std": cv_scores.std(),
            "time": elapsed,         # seconds for the full 5-fold CV
            "pipeline": pipe,        # unfitted; refit before use
        })

        print(f" {algo_name}[{i}]: {cv_scores.mean():.4f} "
              f"+/- {cv_scores.std():.4f} ({elapsed:.2f}s)")

total_time = time.time() - total_start

# --- Leaderboard ---
# Best mean CV accuracy first; show at most the top 10 configurations.
results.sort(key=lambda x: x["cv_mean"], reverse=True)

print(f"\n=== Leaderboard (searched in {total_time:.1f}s) ===")
print(f"{'Rank':<6} {'Algorithm':<20} {'CV Score':>10} {'Std':>8} "
f"{'Time':>8}")
print("-" * 55)
for rank, r in enumerate(results[:10], 1):
    print(f"{rank:<6} {r['algorithm']:<20} {r['cv_mean']:>10.4f} "
          f"{r['cv_std']:>8.4f} {r['time']:>7.2f}s")

# --- Evaluate best model on test set ---
# Refit the winning (so far unfitted) pipeline on the full training split,
# then score it once on the held-out test set.
best = results[0]
best["pipeline"].fit(X_train, y_train)
test_acc = best["pipeline"].score(X_test, y_test)
print(f"\nBest model: {best['algorithm']} "
      f"(CV={best['cv_mean']:.4f}, Test={test_acc:.4f})")

Ensemble Construction
The best AutoML tools do not just pick one model — they build ensembles that combine multiple models for better performance.
Ensemble Methods in AutoML
Ensemble Selection (Auto-sklearn): Greedily selects models from the leaderboard to build an ensemble that maximizes cross-validation performance. Each model gets a weight proportional to its contribution.
Stacked Ensembles (H2O, AutoGluon): Train a meta-learner on the predictions of base models. AutoGluon uses multi-layer stacking: the first layer trains diverse models, the second layer stacks them, and optionally a third layer stacks the stacks.
Bagged Ensembles (AutoGluon): Each base model is trained on multiple folds and their predictions are averaged. This reduces variance without increasing bias.
When to Use AutoML
Good use cases
When manual ML is better
1# === FLAML-style Fast AutoML ===
2# FLAML's key innovation: cost-frugal optimization
3# Spend less time on bad configurations, more on promising ones
4
5import numpy as np
6from sklearn.datasets import make_classification
7from sklearn.model_selection import cross_val_score, train_test_split
8from sklearn.ensemble import (
9 RandomForestClassifier, GradientBoostingClassifier,
10 ExtraTreesClassifier,
11)
12from sklearn.linear_model import LogisticRegression
13from sklearn.preprocessing import StandardScaler
14from sklearn.pipeline import Pipeline
15import time
16
# Reproducible global RNG (also drives the screening subsample inside the
# FLAML-style search below).
np.random.seed(42)

# Synthetic binary task: 3000 rows, 20 features (12 informative), with 20%
# of rows held out for the final test evaluation.
X, y = make_classification(
    n_samples=3000,
    n_features=20,
    n_informative=12,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
26
class FLAMLStyleSearch:
    """Cost-frugal AutoML that allocates more time to promising configs.

    Mimics FLAML's strategy in three phases:
      1. Cheap screening of all candidates (2-fold CV on a small subsample).
      2. Thorough 5-fold CV on the full data for the top candidates.
      3. Local refinement (double ``n_estimators``) of the single best config.

    Every phase checks the wall-clock ``time_budget`` (phases 1 and 2 stop at
    30% and 80% of it, respectively), so the search degrades gracefully
    instead of overrunning.

    Fixes over the naive version: ``fit()`` refits the winning pipeline on
    the full data (``cross_val_score`` only fits clones, so stored pipelines
    would otherwise be unfitted), and ``predict()`` raises a clear error when
    no candidate was evaluated instead of an opaque ``IndexError``.
    """

    def __init__(self, time_budget=30):
        # Total wall-clock seconds allowed for the whole search.
        self.time_budget = time_budget
        # Leaderboard of evaluated configs; sorted best-first after fit().
        self.results = []

    def _quick_evaluate(self, model_cls, params, X, y, n_folds=3):
        """Cross-validate ``model_cls(**params)`` inside a scaling pipeline.

        Returns ``(mean_accuracy, std_accuracy, pipeline)``; the returned
        pipeline is *unfitted* because cross_val_score fits clones.
        """
        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model_cls(**params)),
        ])
        scores = cross_val_score(pipe, X, y, cv=n_folds,
                                 scoring="accuracy")
        return scores.mean(), scores.std(), pipe

    def fit(self, X, y):
        """Run the three-phase, budget-aware search on ``(X, y)``.

        Returns:
            self, with ``self.results`` sorted best-first. May be empty if
            the budget expired before any evaluation finished.
        """
        start_time = time.time()

        # Phase 1: Quick screening (2-fold CV, small data sample)
        print("Phase 1: Quick screening...")
        sample_size = min(500, len(X))
        idx = np.random.choice(len(X), sample_size, replace=False)
        X_sample, y_sample = X[idx], y[idx]

        # Hand-picked spread of cheap-to-moderate configurations.
        candidates = [
            ("LogReg", LogisticRegression, {"C": 1.0, "max_iter": 500}),
            ("RF-small", RandomForestClassifier, {"n_estimators": 30, "max_depth": 5, "random_state": 42}),
            ("RF-med", RandomForestClassifier, {"n_estimators": 100, "max_depth": 10, "random_state": 42}),
            ("GB-fast", GradientBoostingClassifier, {"n_estimators": 30, "learning_rate": 0.1, "max_depth": 3, "random_state": 42}),
            ("GB-med", GradientBoostingClassifier, {"n_estimators": 100, "learning_rate": 0.05, "max_depth": 4, "random_state": 42}),
            ("ET-med", ExtraTreesClassifier, {"n_estimators": 100, "max_depth": 10, "random_state": 42}),
        ]

        phase1_results = []
        for name, cls, params in candidates:
            # Spend at most 30% of the budget on screening.
            if time.time() - start_time > self.time_budget * 0.3:
                break
            t = time.time()
            score, std, pipe = self._quick_evaluate(
                cls, params, X_sample, y_sample, n_folds=2
            )
            elapsed = time.time() - t
            phase1_results.append({
                "name": name, "cls": cls, "params": params,
                "score": score, "time": elapsed,
            })
            print(f" {name}: {score:.4f} ({elapsed:.2f}s)")

        # Phase 2: Deep evaluation of top candidates (5-fold, full data)
        print("\nPhase 2: Deep evaluation of top candidates...")
        phase1_results.sort(key=lambda x: x["score"], reverse=True)
        top_k = min(3, len(phase1_results))

        for r in phase1_results[:top_k]:
            # Reserve the last 20% of the budget for refinement.
            if time.time() - start_time > self.time_budget * 0.8:
                break
            t = time.time()
            score, std, pipe = self._quick_evaluate(
                r["cls"], r["params"], X, y, n_folds=5
            )
            elapsed = time.time() - t
            self.results.append({
                "name": r["name"], "score": score, "std": std,
                "time": elapsed, "pipeline": pipe, "params": r["params"],
            })
            print(f" {r['name']}: {score:.4f} +/- {std:.4f} ({elapsed:.2f}s)")

        # Phase 3: Refine best candidate
        if self.results:
            self.results.sort(key=lambda x: x["score"], reverse=True)
            best = self.results[0]
            print(f"\nPhase 3: Refining {best['name']}...")

            # Try nearby hyperparameters: only doubling n_estimators, which
            # is the knob all tree ensembles here share.
            if "n_estimators" in best["params"]:
                for n_est in [best["params"]["n_estimators"] * 2]:
                    if time.time() - start_time > self.time_budget:
                        break
                    refined_params = {**best["params"], "n_estimators": n_est}
                    t = time.time()
                    score, std, pipe = self._quick_evaluate(
                        best["pipeline"].named_steps["model"].__class__,
                        refined_params, X, y, n_folds=5
                    )
                    elapsed = time.time() - t
                    self.results.append({
                        "name": f"{best['name']}-refined",
                        "score": score, "std": std,
                        "time": elapsed, "pipeline": pipe,
                        "params": refined_params,
                    })
                    print(f" {best['name']}-refined: {score:.4f} "
                          f"+/- {std:.4f} ({elapsed:.2f}s)")

        total = time.time() - start_time
        self.results.sort(key=lambda x: x["score"], reverse=True)
        if self.results:
            # Refit the winner on the full data so predict() works right
            # away — cross_val_score fit only clones, never this pipeline.
            self.results[0]["pipeline"].fit(X, y)
        print(f"\nCompleted in {total:.1f}s")
        return self

    def predict(self, X):
        """Predict with the best pipeline found by :meth:`fit`.

        Raises:
            RuntimeError: if fit() has not run, or its time budget expired
                before any configuration finished evaluating.
        """
        if not self.results:
            raise RuntimeError(
                "No evaluated configurations; call fit() with a "
                "sufficient time_budget before predict()."
            )
        best = self.results[0]
        return best["pipeline"].predict(X)

    def leaderboard(self):
        """Print every evaluated configuration, best first."""
        print("\n=== FLAML Leaderboard ===")
        for i, r in enumerate(self.results, 1):
            print(f" {i}. {r['name']:<25} "
                  f"CV={r['score']:.4f} +/- {r['std']:.4f}")
137
138# --- Run FLAML-style search ---
139automl = FLAMLStyleSearch(time_budget=60)
140automl.fit(X_train, y_train)
141automl.leaderboard()
142
143# Test set evaluation
144best_pipe = automl.results[0]["pipeline"]
145best_pipe.fit(X_train, y_train)
146test_score = best_pipe.score(X_test, y_test)
147print(f"\nBest model test accuracy: {test_score:.4f}")