Classical Machine Learning
The algorithms that power real production systems. Credit scoring, fraud detection, recommendation engines, medical diagnosis — all built on the techniques you'll master here. Classical ML is battle-tested, generalizes well on tabular data, and is usually far easier to explain than deep learning.
Learn from Labeled Examples to Make Predictions
Given input features X and target labels y, find a function f such that f(X) ≈ y on unseen data. This is the foundation of the vast majority of production ML systems.
| Algorithm | Best For | Interpretable | Needs Scaling | Handles Missing |
|---|---|---|---|---|
| Linear Regression | Continuous output, linear data | Yes | Yes | No |
| Logistic Regression | Binary classification baseline | Yes | Yes | No |
| Decision Tree | Non-linear, categorical data | Yes | No | No |
| Random Forest | Tabular data general use | Partial | No | No |
| SVM | High-dim, small datasets | No | Yes | No |
| KNN | Low-dimensional data, lazy learner (no training phase) | Yes | Yes | No |
```python
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing, load_breast_cancer
import numpy as np

# ── 1. LINEAR REGRESSION ──────────────────────────────────────────────
X, y = fetch_california_housing(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_tr, y_tr)
rmse = np.sqrt(mean_squared_error(y_te, lr.predict(X_te)))
print(f"Linear Regression RMSE: {rmse:.3f}")
print(f"Coefficients: {lr.coef_.round(3)}")

# Ridge (L2 regularization) — penalises large weights
ridge = Ridge(alpha=1.0)
ridge.fit(X_tr, y_tr)

# Lasso (L1 regularization) — drives small weights to exactly 0
lasso = Lasso(alpha=0.01)
lasso.fit(X_tr, y_tr)
print(f"Lasso zeros: {(lasso.coef_ == 0).sum()} / {len(lasso.coef_)}")

# ── 2. LOGISTIC REGRESSION ───────────────────────────────────────────
X, y = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                          stratify=y, random_state=42)

# Scale first — logistic regression benefits from scaling
scaler = StandardScaler()
X_tr_s = scaler.fit_transform(X_tr)
X_te_s = scaler.transform(X_te)

logr = LogisticRegression(C=1.0, max_iter=1000)
logr.fit(X_tr_s, y_tr)
print(f"Logistic Regression Accuracy: {accuracy_score(y_te, logr.predict(X_te_s)):.3f}")

# Get predicted probabilities (important for calibration)
probs = logr.predict_proba(X_te_s)[:, 1]  # P(class=1)

# ── 3. DECISION TREE ─────────────────────────────────────────────────
tree = DecisionTreeClassifier(
    max_depth=5,           # prevent overfitting
    min_samples_leaf=10,   # at least 10 samples per leaf
    criterion='gini',      # or 'entropy'
    random_state=42
)
tree.fit(X_tr, y_tr)
print(f"Decision Tree Accuracy: {accuracy_score(y_te, tree.predict(X_te)):.3f}")

# Print human-readable tree rules
feature_names = load_breast_cancer().feature_names
rules = export_text(tree, feature_names=list(feature_names), max_depth=3)
print(rules)

# ── 4. RANDOM FOREST ─────────────────────────────────────────────────
rf = RandomForestClassifier(
    n_estimators=200,      # number of trees
    max_depth=8,
    min_samples_leaf=5,
    max_features='sqrt',   # random feature subset per split
    n_jobs=-1,             # use all CPU cores
    random_state=42
)
rf.fit(X_tr, y_tr)
print(f"Random Forest Accuracy: {accuracy_score(y_te, rf.predict(X_te)):.3f}")

# Feature importances
importances = rf.feature_importances_
top5 = np.argsort(importances)[:-6:-1]
for i in top5:
    print(f"  {feature_names[i]:30s} {importances[i]:.4f}")

# ── 5. SVM ───────────────────────────────────────────────────────────
svm = SVC(kernel='rbf', C=10, gamma='scale', probability=True)
svm.fit(X_tr_s, y_tr)  # SVMs require scaled features!
print(f"SVM Accuracy: {accuracy_score(y_te, svm.predict(X_te_s)):.3f}")

# ── 6. KNN ───────────────────────────────────────────────────────────
knn = KNeighborsClassifier(n_neighbors=7, metric='euclidean')
knn.fit(X_tr_s, y_tr)  # KNN also requires scaled features!
print(f"KNN Accuracy: {accuracy_score(y_te, knn.predict(X_te_s)):.3f}")

# ── 7. COMPARE ALL IN ONE SHOT ───────────────────────────────────────
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', gamma='scale'),
    'KNN': KNeighborsClassifier(n_neighbors=7),
}
for name, model in models.items():
    model.fit(X_tr_s, y_tr)
    acc = accuracy_score(y_te, model.predict(X_te_s))
    print(f"  {name:25s}: {acc:.3f}")
```
stratify=y ensures class ratios are preserved. Critical for imbalanced datasets.
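A quick way to see what stratification buys you — a minimal, self-contained sketch on synthetic 95/5 labels; the names X_demo and y_demo are made up for illustration:

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Hypothetical imbalanced labels: roughly 5% positives
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(1000, 3))
y_demo = (rng.random(1000) < 0.05).astype(int)

for strat in (None, y_demo):
    _, _, y_train, y_test = train_test_split(X_demo, y_demo, test_size=0.2,
                                             stratify=strat, random_state=0)
    label = 'None' if strat is None else 'y_demo'
    # With stratify, train and test positive rates match the overall rate
    print(f"stratify={label:7s} train pos rate: {y_train.mean():.3f}  "
          f"test pos rate: {y_test.mean():.3f}")
```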
Find Hidden Structure in Unlabeled Data
No labels required. Clustering discovers natural groupings, dimensionality reduction reveals structure, and anomaly detection finds the unusual. Most data in the world is unlabeled — this is how you use it.
Silhouette Score: Ranges from -1 to 1. Higher = better-defined clusters. Use sklearn.metrics.silhouette_score(X, labels) and choose the K that maximizes it.
```python
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_digits, make_blobs

# ── 1. K-MEANS ────────────────────────────────────────────────────────
X, y_true = make_blobs(n_samples=500, n_features=2, centers=4, random_state=42)

# ALWAYS scale before K-Means
scaler = StandardScaler()
X_s = scaler.fit_transform(X)

# Elbow method — find optimal K
inertias, sil_scores = [], []
K_range = range(2, 10)
for k in K_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X_s)
    inertias.append(km.inertia_)
    sil_scores.append(silhouette_score(X_s, km.labels_))
best_k = K_range.start + np.argmax(sil_scores)
print(f"Best K by silhouette: {best_k}")

# Fit final model
km = KMeans(n_clusters=best_k, n_init=20, random_state=42)
labels = km.fit_predict(X_s)
print(f"Cluster sizes: {np.bincount(labels)}")
print(f"Inertia: {km.inertia_:.1f}")

# ── 2. DBSCAN — density-based, no K needed ────────────────────────────
db = DBSCAN(eps=0.5, min_samples=5)
db_labels = db.fit_predict(X_s)
n_clusters = len(set(db_labels)) - (1 if -1 in db_labels else 0)
n_noise = (db_labels == -1).sum()
print(f"DBSCAN: {n_clusters} clusters, {n_noise} noise points")

# ── 3. GAUSSIAN MIXTURE MODEL — soft probabilistic clusters ──────────
gmm = GaussianMixture(n_components=4, covariance_type='full', random_state=42)
gmm.fit(X_s)
gmm_labels = gmm.predict(X_s)
probs = gmm.predict_proba(X_s)  # shape (n, k) — soft assignments
print(f"GMM BIC: {gmm.bic(X_s):.1f}")  # lower = better fit

# ── 4. PCA — dimensionality reduction ─────────────────────────────────
X_digits, y_digits = load_digits(return_X_y=True)  # 1797 × 64
pca = PCA(n_components=0.95)  # keep 95% of variance
X_pca = pca.fit_transform(X_digits)
print(f"PCA: {X_digits.shape[1]} → {X_pca.shape[1]} dims")
print(f"Variance explained: {pca.explained_variance_ratio_.sum():.2%}")

# Manual PCA via SVD — understand what sklearn does under the hood
X_c = X_digits - X_digits.mean(axis=0)
U, S, Vt = np.linalg.svd(X_c, full_matrices=False)
X_manual_pca = X_c @ Vt[:20].T  # top 20 components

# ── 5. t-SNE — 2D visualization ───────────────────────────────────────
# Best practice: PCA first to ~50 dims, then t-SNE (much faster)
X_50 = PCA(n_components=50).fit_transform(X_digits)
X_2d = TSNE(n_components=2, perplexity=30, random_state=42,
            n_iter=1000).fit_transform(X_50)
print(f"t-SNE shape: {X_2d.shape}")  # (1797, 2)

# ── 6. ISOLATION FOREST — anomaly detection ───────────────────────────
# contamination = expected fraction of outliers in your data
iso = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)
iso.fit(X_s)
# -1 = anomaly, +1 = normal
anomaly_labels = iso.predict(X_s)
anomaly_scores = iso.decision_function(X_s)  # lower = more anomalous
print(f"Anomalies detected: {(anomaly_labels == -1).sum()}")
```
Measure Performance Honestly and Correctly
A model that scores 99% accuracy on imbalanced data might never predict the minority class (the sketch after the table below makes this concrete). Choosing the right metric and validation strategy separates real ML engineers from beginners.
| Problem Type | Good Metric | Avoid | When to Use Precision vs Recall |
|---|---|---|---|
| Balanced classification | Accuracy, F1 | — | Accuracy is fine |
| Imbalanced classification | ROC-AUC, PR-AUC | Accuracy | PR-AUC when positives are rare |
| Fraud detection | Recall, PR-AUC | Accuracy | Prioritize recall (catch all fraud) |
| Spam filter | Precision, F1 | Accuracy | Prioritize precision (don't block real mail) |
| Regression | RMSE or MAE | R² alone | MAE if outliers exist in y |
| Ranking | NDCG, MAP | Accuracy | Order matters |
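To make the imbalanced-classification rows concrete, here is a minimal sketch on synthetic data (names are hypothetical): a classifier that always predicts the majority class scores roughly 99% accuracy while its recall on the minority class is zero.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Synthetic data with roughly 1% positives
X_imb, y_imb = make_classification(n_samples=5000, n_features=10,
                                   weights=[0.99, 0.01], random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_imb, y_imb, test_size=0.2,
                                          stratify=y_imb, random_state=42)

# A "model" that always predicts the majority class
dummy = DummyClassifier(strategy='most_frequent').fit(X_tr, y_tr)
y_pred = dummy.predict(X_te)

print(f"Accuracy: {accuracy_score(y_te, y_pred):.3f}")                    # ~0.99, looks great
print(f"Recall:   {recall_score(y_te, y_pred, zero_division=0):.3f}")     # 0.0, misses every positive
print(f"F1:       {f1_score(y_te, y_pred, zero_division=0):.3f}")         # 0.0
```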
```python
import numpy as np
from sklearn.model_selection import (
    cross_val_score, StratifiedKFold, KFold, learning_curve, validation_curve
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, classification_report,
    confusion_matrix, mean_squared_error, mean_absolute_error, r2_score,
    ConfusionMatrixDisplay, RocCurveDisplay
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

X, y = load_breast_cancer(return_X_y=True)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# ── 1. STRATIFIED K-FOLD CROSS-VALIDATION ─────────────────────────────
model = RandomForestClassifier(n_estimators=100, random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score multiple metrics in one CV run
from sklearn.model_selection import cross_validate
results = cross_validate(model, X, y, cv=cv,
                         scoring=['accuracy', 'roc_auc', 'f1', 'precision', 'recall'],
                         return_train_score=True)
for metric in ['accuracy', 'roc_auc', 'f1']:
    val = results[f'test_{metric}']
    train = results[f'train_{metric}']
    print(f"{metric:12s} val: {val.mean():.3f} ± {val.std():.3f}   "
          f"train: {train.mean():.3f}")
# If train >> val: overfitting. If both low: underfitting.

# ── 2. THRESHOLD-BASED METRICS ────────────────────────────────────────
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                          stratify=y, random_state=42)
model.fit(X_tr, y_tr)
y_pred = model.predict(X_te)
y_prob = model.predict_proba(X_te)[:, 1]

print(classification_report(y_te, y_pred, target_names=['benign', 'malignant']))
print(f"ROC-AUC: {roc_auc_score(y_te, y_prob):.4f}")
print(f"PR-AUC:  {average_precision_score(y_te, y_prob):.4f}")

# Confusion matrix
cm = confusion_matrix(y_te, y_pred)
print("Confusion Matrix:")
print(cm)
# [[TN, FP],
#  [FN, TP]]

# ── 3. CUSTOM THRESHOLD — precision/recall tradeoff ───────────────────
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_te, y_prob)
# Find threshold for ≥0.90 precision
idx = np.argmax(precisions >= 0.90)
optimal_threshold = thresholds[idx]
print(f"Threshold for 90% precision: {optimal_threshold:.3f}")
y_pred_custom = (y_prob >= optimal_threshold).astype(int)
print(f"Custom threshold — precision: {precision_score(y_te, y_pred_custom):.3f}, "
      f"recall: {recall_score(y_te, y_pred_custom):.3f}")

# ── 4. LEARNING CURVES — diagnose bias vs variance ────────────────────
train_sizes, train_scores, val_scores = learning_curve(
    model, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='roc_auc', n_jobs=-1)
# Plot: if train high, val low → overfit (need regularization/more data)
#       if both low → underfit (need more complex model)
#       if both high and close → ideal

# ── 5. REGRESSION METRICS ─────────────────────────────────────────────
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
X_r, y_r = fetch_california_housing(return_X_y=True)
X_r_tr, X_r_te, y_r_tr, y_r_te = train_test_split(X_r, y_r, test_size=0.2,
                                                  random_state=42)
reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_r_tr, y_r_tr)
y_r_pred = reg.predict(X_r_te)
rmse = np.sqrt(mean_squared_error(y_r_te, y_r_pred))
mae = mean_absolute_error(y_r_te, y_r_pred)
r2 = r2_score(y_r_te, y_r_pred)
print(f"RMSE: {rmse:.3f}  MAE: {mae:.3f}  R²: {r2:.3f}")

# ── 6. DATA LEAKAGE — the silent killer ───────────────────────────────
# WRONG: fit scaler on all data, then split
from sklearn.preprocessing import StandardScaler
scaler_bad = StandardScaler()
X_bad = scaler_bad.fit_transform(X)  # ← leaks test stats into training!
X_bad_tr, X_bad_te = X_bad[:400], X_bad[400:]

# RIGHT: fit scaler only on train, transform test
X_raw_tr, X_raw_te = X[:400], X[400:]
scaler_good = StandardScaler()
X_good_tr = scaler_good.fit_transform(X_raw_tr)
X_good_te = scaler_good.transform(X_raw_te)  # ← correct!

# Or best of all: use a Pipeline
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42))
])
cv_scores = cross_val_score(pipe, X, y, cv=cv, scoring='roc_auc')
print(f"Pipeline CV AUC: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
```
Create Features That Make Models Smarter
Better features beat better algorithms. A weak model with great features consistently outperforms a complex model on raw data. Feature engineering is where domain expertise meets machine learning.
To avoid target leakage, fit target encodings on training folds only: use sklearn.preprocessing.TargetEncoder inside a Pipeline, or compute the encoding manually inside each CV fold.
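A minimal sketch of that pattern, assuming scikit-learn >= 1.3 (where TargetEncoder was added); the toy DataFrame and column names are made up for illustration. TargetEncoder cross-fits internally during fit_transform, and wrapping it in a Pipeline keeps the outer CV folds leak-free as well.

```python
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import TargetEncoder  # scikit-learn >= 1.3

# Hypothetical dataset with one high-cardinality categorical column
rng = np.random.default_rng(42)
n = 1000
df_te = pd.DataFrame({
    'city': rng.choice(['NY', 'LA', 'SF', 'CHI', 'HOU'], n),
    'score': rng.integers(400, 850, n),
    'target': rng.integers(0, 2, n),
})

prep = ColumnTransformer([
    ('city_te', TargetEncoder(random_state=42), ['city']),  # encoded per CV split internally
], remainder='passthrough')

pipe = Pipeline([
    ('prep', prep),
    ('model', LogisticRegression(max_iter=1000)),
])

# The encoder is refit inside each outer fold, so no target leakage
scores = cross_val_score(pipe, df_te[['city', 'score']], df_te['target'],
                         cv=5, scoring='roc_auc')
print(f"CV AUC with target-encoded city: {scores.mean():.3f}")
```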
```python
import pandas as pd
import numpy as np
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler,
    OneHotEncoder, OrdinalEncoder, TargetEncoder,
    PolynomialFeatures, FunctionTransformer
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split

# ── SAMPLE DATASET ─────────────────────────────────────────────────────
np.random.seed(42)
n = 1000
df = pd.DataFrame({
    'age': np.random.randint(18, 75, n),
    'income': np.random.exponential(50000, n),
    'city': np.random.choice(['NY', 'LA', 'SF', 'CHI', 'HOU'], n),
    'education': np.random.choice(['HS', 'BA', 'MS', 'PhD'], n),
    'join_date': pd.date_range('2020-01-01', periods=n, freq='D'),
    'score': np.random.randint(400, 850, n),
    'target': np.random.randint(0, 2, n)
})
# Inject missing values
df.loc[np.random.choice(n, 50, replace=False), 'age'] = np.nan

# ── 1. BASIC TRANSFORMS ────────────────────────────────────────────────
# Log transform — right-skewed income
df['log_income'] = np.log1p(df['income'])

# Binning age
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 50, 100],
                         labels=['young', 'mid', 'senior', 'elder'])

# Interaction terms
df['income_per_age'] = df['income'] / (df['age'] + 1)
df['score_x_income'] = df['score'] * df['log_income']

# ── 2. DATE FEATURES ───────────────────────────────────────────────────
df['year'] = df['join_date'].dt.year
df['month'] = df['join_date'].dt.month
df['dayofweek'] = df['join_date'].dt.dayofweek
df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
df['quarter'] = df['join_date'].dt.quarter
df['days_since_join'] = (pd.Timestamp('2024-01-01') - df['join_date']).dt.days

# ── 3. ORDINAL ENCODING ────────────────────────────────────────────────
edu_order = [['HS', 'BA', 'MS', 'PhD']]
oe = OrdinalEncoder(categories=edu_order)
df['edu_encoded'] = oe.fit_transform(df[['education']])

# ── 4. SKLEARN PIPELINE — the right way ───────────────────────────────
X = df.drop(columns=['target', 'join_date', 'city', 'education', 'age_group'])
y = df['target']
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# income_per_age inherits NaNs from age, so it goes through imputation too
numeric_cols = ['age', 'income', 'log_income', 'score',
                'days_since_join', 'income_per_age']

numeric_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipe, numeric_cols),
], remainder='passthrough')

from sklearn.ensemble import RandomForestClassifier
full_pipe = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))
])
full_pipe.fit(X_tr, y_tr)

# ── 5. FULL ColumnTransformer with multiple column types ──────────────
df2 = df.copy()
num_features = ['age', 'income', 'score']
cat_nominal = ['city']
cat_ordinal = ['education']

preprocessor2 = ColumnTransformer([
    ('num', Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('log', FunctionTransformer(np.log1p)),
        ('scale', StandardScaler()),
    ]), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), cat_nominal),
    ('ord', OrdinalEncoder(categories=edu_order), cat_ordinal),
])

# ── 6. FEATURE SELECTION ───────────────────────────────────────────────
X_prep = preprocessor.fit_transform(X_tr, y_tr)

# Select top-K features by ANOVA F-score
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X_prep, y_tr)
print(f"Selected {X_selected.shape[1]} features from {X_prep.shape[1]}")

# Mutual information — works for non-linear relationships too
mi_scores = mutual_info_classif(X_prep, y_tr, random_state=42)
print(f"Top MI scores: {sorted(mi_scores, reverse=True)[:5]}")

# ── 7. POLYNOMIAL FEATURES ─────────────────────────────────────────────
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
X_poly = poly.fit_transform(X_prep[:, :3])  # first 3 features
print(f"Polynomial features: {X_prep.shape[1]} → {X_poly.shape[1]}")
```
Once preprocessing lives inside a Pipeline, a single .predict() call handles everything: imputation, scaling, and the model.
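A small self-contained sketch of that point (hypothetical data and names): raw rows with missing values go straight into the fitted pipeline, and imputation plus scaling happen inside predict().

```python
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X_raw = rng.normal(loc=50, scale=10, size=(200, 4))
X_raw[rng.random((200, 4)) < 0.1] = np.nan               # ~10% missing values
y_raw = (np.nan_to_num(X_raw[:, 0], nan=50) > 50).astype(int)

pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
    ('model', LogisticRegression()),
])
pipe.fit(X_raw, y_raw)

# New raw rows, untouched: imputation and scaling run inside predict()
new_rows = np.array([[np.nan, 48.0, 55.0, 60.0],
                     [62.0, np.nan, 49.0, 51.0]])
print(pipe.predict(new_rows))
```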
np.log1p computes log(1 + x), so it handles zeros gracefully, unlike np.log.
Combine Weak Learners into Powerful Models
XGBoost and LightGBM win most tabular ML competitions, and gradient boosting is the dominant paradigm for structured data in industry. Understanding how it works helps you tune it effectively; a from-scratch sketch of the boosting loop follows the parameter table below.
| Parameter | Effect | Typical Range | Direction if Overfitting |
|---|---|---|---|
| learning_rate (η) | Step size. Smaller = more trees needed but generalizes better. | 0.01 – 0.3 | ↓ decrease |
| n_estimators | Number of trees. Use with early stopping. | 100 – 5000 | Use early stopping |
| max_depth | Tree depth. Deeper = more complex interactions. | 3 – 8 | ↓ decrease |
| subsample | Fraction of training rows per tree. Adds stochasticity. | 0.6 – 1.0 | ↓ decrease |
| colsample_bytree | Fraction of features per tree. Like RF's max_features. | 0.6 – 1.0 | ↓ decrease |
| reg_lambda (L2) | L2 regularization on leaf weights. Prevents large weights. | 0.1 – 10 | ↑ increase |
| reg_alpha (L1) | L1 regularization. Sparse leaf weights. | 0 – 1 | ↑ increase |
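To see what these knobs actually control, here is a from-scratch sketch of the core boosting loop for squared-error regression (synthetic data, hypothetical names): each new tree is fit to the residuals of the current ensemble, and its contribution is shrunk by the learning rate.

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

X_gb, y_gb = make_regression(n_samples=2000, n_features=10, noise=10, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_gb, y_gb, test_size=0.2, random_state=42)

learning_rate = 0.1     # the η from the table above
n_estimators = 200
max_depth = 3

# Start from a constant prediction (the mean), then add shrunken trees
pred_tr = np.full_like(y_tr, y_tr.mean(), dtype=float)
pred_te = np.full_like(y_te, y_tr.mean(), dtype=float)
trees = []
for _ in range(n_estimators):
    residuals = y_tr - pred_tr                      # negative gradient of squared loss
    tree = DecisionTreeRegressor(max_depth=max_depth)
    tree.fit(X_tr, residuals)
    pred_tr += learning_rate * tree.predict(X_tr)   # shrink each tree's contribution
    pred_te += learning_rate * tree.predict(X_te)
    trees.append(tree)

rmse = np.sqrt(mean_squared_error(y_te, pred_te))
print(f"Hand-rolled gradient boosting RMSE: {rmse:.2f}")
```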
```python
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    VotingClassifier, StackingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_breast_cancer
import optuna

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                          stratify=y, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2,
                                            stratify=y_tr, random_state=42)

# ── 1. XGBOOST — with early stopping ──────────────────────────────────
# (use_label_encoder is deprecated in recent XGBoost versions and omitted here)
xgb_model = xgb.XGBClassifier(
    n_estimators=2000,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.5,
    reg_alpha=0.1,
    min_child_weight=3,
    eval_metric='auc',
    early_stopping_rounds=50,
    random_state=42,
    verbosity=0
)
xgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
xgb_auc = roc_auc_score(y_te, xgb_model.predict_proba(X_te)[:, 1])
print(f"XGBoost AUC: {xgb_auc:.4f}  Best round: {xgb_model.best_iteration}")

# Feature importance
feat_imp = xgb_model.feature_importances_
top_idx = np.argsort(feat_imp)[:-6:-1]
print("Top-5 features:", load_breast_cancer().feature_names[top_idx])

# ── 2. LIGHTGBM — faster, better for large data ───────────────────────
lgb_model = lgb.LGBMClassifier(
    n_estimators=2000,
    num_leaves=31,          # LightGBM uses num_leaves, not max_depth
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    min_child_samples=20,
    random_state=42,
    verbose=-1
)
lgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
              callbacks=[lgb.early_stopping(50, verbose=False),
                         lgb.log_evaluation(-1)])
lgb_auc = roc_auc_score(y_te, lgb_model.predict_proba(X_te)[:, 1])
print(f"LightGBM AUC: {lgb_auc:.4f}")

# ── 3. CATBOOST — native categorical support ──────────────────────────
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='AUC',
    early_stopping_rounds=50,
    random_seed=42,
    verbose=0
)
cb_model.fit(X_tr, y_tr, eval_set=(X_val, y_val))
cb_auc = roc_auc_score(y_te, cb_model.predict_proba(X_te)[:, 1])
print(f"CatBoost AUC: {cb_auc:.4f}")

# ── 4. VOTING ENSEMBLE ────────────────────────────────────────────────
rf = RandomForestClassifier(n_estimators=200, random_state=42)
xgb2 = xgb.XGBClassifier(n_estimators=200, random_state=42, verbosity=0)
lgb2 = lgb.LGBMClassifier(n_estimators=200, random_state=42, verbose=-1)

voting = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb2), ('lgb', lgb2)],
    voting='soft'  # average probabilities, not votes
)
voting.fit(X_tr, y_tr)
vote_auc = roc_auc_score(y_te, voting.predict_proba(X_te)[:, 1])
print(f"Voting Ensemble AUC: {vote_auc:.4f}")

# ── 5. STACKING ───────────────────────────────────────────────────────
estimators = [
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=200, random_state=42, verbosity=0)),
    ('lgb', lgb.LGBMClassifier(n_estimators=200, random_state=42, verbose=-1)),
]
stacker = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(C=0.1),  # meta-learner
    cv=5,
    passthrough=True  # include original features alongside base predictions
)
stacker.fit(X_tr, y_tr)
stack_auc = roc_auc_score(y_te, stacker.predict_proba(X_te)[:, 1])
print(f"Stacking AUC: {stack_auc:.4f}")

# ── 6. OPTUNA HYPERPARAMETER OPTIMIZATION ─────────────────────────────
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
    }
    model = xgb.XGBClassifier(**params, random_state=42, verbosity=0)
    score = cross_val_score(model, X_tr, y_tr, cv=5, scoring='roc_auc')
    return score.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)
print(f"Best AUC: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")

# Train final model with best params
best = xgb.XGBClassifier(**study.best_params, random_state=42, verbosity=0)
best.fit(X_tr, y_tr)
final_auc = roc_auc_score(y_te, best.predict_proba(X_te)[:, 1])
print(f"Final XGBoost AUC on test: {final_auc:.4f}")
```