Recipes
Quick solutions to common tasks.
Cross-Validation
from sklearn.model_selection import cross_val_score
from boosters.sklearn import GBDTRegressor
# Score a GBDT regressor with 5-fold cross-validation.
# Assumes X and y are already loaded by the reader.
# The "neg_root_mean_squared_error" scorer yields negated RMSE, hence the
# sign flip on the mean below; the standard deviation is unaffected by sign.
model = GBDTRegressor(n_estimators=100)
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
print(f"RMSE: {-scores.mean():.4f} ± {scores.std():.4f}")
Hyperparameter Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from boosters.sklearn import GBDTRegressor
# Distributions to sample hyperparameters from.
# scipy's uniform(loc, scale) covers [loc, loc + scale];
# randint(low, high) draws integers from low up to, but excluding, high.
param_dist = dict(
    n_estimators=randint(50, 200),     # integers 50..199
    max_depth=randint(3, 8),           # integers 3..7
    learning_rate=uniform(0.01, 0.3),  # floats in [0.01, 0.31]
    subsample=uniform(0.6, 0.4),       # floats in [0.6, 1.0]
)

# Try 20 random configurations, each scored with 3-fold cross-validation.
search = RandomizedSearchCV(
    estimator=GBDTRegressor(),
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_root_mean_squared_error",
    cv=3,
    # Use n_jobs=1 to avoid pickling issues with Rust models
    n_jobs=1,
)
search.fit(X, y)
print(f"Best params: {search.best_params_}")
Feature Importance
from boosters.sklearn import GBDTClassifier
import matplotlib.pyplot as plt
# Fit a classifier with default settings, then chart one horizontal bar per
# feature. Assumes X_train and y_train are already defined.
model = GBDTClassifier()
model.fit(X_train, y_train)

# One importance value per feature.
importance = model.feature_importances_

# Bars are placed at positions 0..n_features-1 on the y-axis.
bar_positions = range(len(importance))
plt.barh(bar_positions, importance)
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()
Pipeline with Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from boosters.sklearn import GBDTClassifier
# Column-wise preprocessing: scale the numeric columns, one-hot encode the
# categorical ones. numeric_features / categorical_features are the column
# selectors supplied by the reader.
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = ("cat", OneHotEncoder(drop="first"), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Chain preprocessing and the classifier so both are fitted together.
classifier = GBDTClassifier(n_estimators=100)
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", classifier),
    ]
)
pipeline.fit(X_train, y_train)
Save and Load with Pipeline
import joblib
# Single location for the serialized pipeline, shared by dump and load.
PIPELINE_PATH = "pipeline.joblib"

# Persist the whole fitted pipeline (preprocessing + model) in one file.
joblib.dump(pipeline, PIPELINE_PATH)

# Restore it and predict on new data.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
pipeline = joblib.load(PIPELINE_PATH)
predictions = pipeline.predict(X_new)
Multi-Output Regression
from sklearn.multioutput import MultiOutputRegressor
from boosters.sklearn import GBDTRegressor
# Wrap a single-target regressor so it can be trained on several target
# columns at once.
base_regressor = GBDTRegressor(n_estimators=100)
model = MultiOutputRegressor(base_regressor)
model.fit(X_train, y_train_multi)  # y has multiple columns
predictions = model.predict(X_test)
Class Imbalance
from sklearn.utils.class_weight import compute_sample_weight
from boosters.sklearn import GBDTClassifier
# Per-sample weights derived from the labels using the "balanced" scheme.
weights = compute_sample_weight(class_weight="balanced", y=y_train)

# Pass the weights through to training.
model = GBDTClassifier(n_estimators=100)
model.fit(X_train, y_train, sample_weight=weights)
Time Series Split
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from boosters.sklearn import GBDTRegressor

# Evaluate with 5 splits produced by TimeSeriesSplit.
# Assumes X and y are NumPy arrays, since they are indexed with the integer
# index arrays returned by split(); confirm before using other containers.
tscv = TimeSeriesSplit(n_splits=5)
scores = []
for train_idx, val_idx in tscv.split(X):
    # Fresh model per split so no state leaks between folds.
    model = GBDTRegressor(n_estimators=100)
    model.fit(X[train_idx], y[train_idx])
    score = model.score(X[val_idx], y[val_idx])
    scores.append(score)

# Report the average validation score across all splits.
print(f"Mean R²: {np.mean(scores):.4f}")
Probability Calibration
from sklearn.calibration import CalibratedClassifierCV
from boosters.sklearn import GBDTClassifier
# Wrap the classifier in a calibrator using the isotonic method with 5-fold
# cross-validation. The estimator is passed positionally because its keyword
# name differs between scikit-learn versions.
model = GBDTClassifier(n_estimators=100)
calibrated = CalibratedClassifierCV(
    model,
    method="isotonic",
    cv=5,
)
calibrated.fit(X_train, y_train)

# Calibrated class probabilities for the held-out data.
probabilities = calibrated.predict_proba(X_test)
Model Comparison
from sklearn.model_selection import cross_validate
from boosters.sklearn import GBDTRegressor, GBLinearRegressor
# Candidate models, keyed by the label used in the printed report.
models = {
    "GBDT": GBDTRegressor(n_estimators=100),
    "GBLinear": GBLinearRegressor(n_estimators=100),
}

# Run the same 5-fold cross-validation for every model and report train and
# test error side by side. Assumes X and y are already loaded.
for name, model in models.items():
    cv_results = cross_validate(
        model, X, y, cv=5,
        scoring="neg_root_mean_squared_error",
        return_train_score=True,  # needed for the 'train_score' entry below
    )
    # Scores are negated RMSE, so flip the sign for display.
    print(f"{name}:")
    print(f" Train RMSE: {-cv_results['train_score'].mean():.4f}")
    print(f" Test RMSE: {-cv_results['test_score'].mean():.4f}")