"""Gradient Boosted Linear sklearn-compatible estimators."""
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Self, TypeVar
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
if TYPE_CHECKING:
from numpy.typing import NDArray
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
from boosters import Dataset, GBLinearConfig, GBLinearModel, Metric, Objective
if TYPE_CHECKING:
from collections.abc import Mapping
# Public API of this module: only the two concrete estimators are exported;
# the shared base class is private.
__all__ = ["GBLinearClassifier", "GBLinearRegressor"]
# Type variable bounded to the private estimator base class.
# NOTE(review): not referenced in the code visible in this module (fit() is
# annotated with ``Self`` instead) — confirm whether it is still needed.
T = TypeVar("T", bound="_GBLinearEstimatorBase")
# =============================================================================
# Base Estimator
# =============================================================================
class _GBLinearEstimatorBase(BaseEstimator, ABC):
    """Base class for GBLinear estimators.

    Handles common initialization, config creation, and fitting logic.
    Subclasses define task-specific behavior (regression vs classification)
    by implementing the abstract hooks declared below.

    Parameters
    ----------
    n_estimators : int, default=100
        Forwarded to ``GBLinearConfig(n_estimators=...)``.
    learning_rate : float, default=0.5
        Forwarded to ``GBLinearConfig(learning_rate=...)``.
    l1 : float, default=0.0
        Forwarded to ``GBLinearConfig(l1=...)``.
    l2 : float, default=1.0
        Forwarded to ``GBLinearConfig(l2=...)``.
    early_stopping_rounds : int or None, default=None
        Forwarded to ``GBLinearConfig(early_stopping_rounds=...)``.
    seed : int, default=42
        Forwarded to ``GBLinearConfig(seed=...)``.
    n_threads : int, default=0
        Forwarded to ``GBLinearModel.train(n_threads=...)``.
    verbose : int, default=1
        Stored as a hyperparameter. The code in this class does not forward
        it to ``GBLinearConfig`` or ``GBLinearModel.train``.
    objective : Objective or None, default=None
        Loss function. If None, the subclass default is used.
    metric : Metric or None, default=None
        Evaluation metric. If None, the subclass default is used.
    """

    # Instance attributes (declared for type checking); both are set by fit().
    model_: GBLinearModel
    n_features_in_: int

    @classmethod
    @abstractmethod
    def _get_default_objective(cls) -> Objective:
        """Return the default objective for this estimator type."""
        ...

    @classmethod
    @abstractmethod
    def _get_default_metric(cls) -> Metric | None:
        """Return the default metric for this estimator type."""
        ...

    @classmethod
    @abstractmethod
    def _validate_objective(cls, objective: Objective) -> None:
        """Validate objective is appropriate for this estimator type.

        Raises
        ------
        ValueError
            If objective is not valid for this estimator type.
        """
        ...

    def __init__(  # noqa: PLR0913 (sklearn estimators have many hyperparameters)
        self,
        n_estimators: int = 100,
        learning_rate: float = 0.5,
        l1: float = 0.0,
        l2: float = 1.0,
        early_stopping_rounds: int | None = None,
        seed: int = 42,
        n_threads: int = 0,
        verbose: int = 1,
        objective: Objective | None = None,
        metric: Metric | None = None,
    ) -> None:
        # Store all parameters verbatim (sklearn convention: no validation or
        # derived state in __init__). The config is built at fit() time from
        # these attributes so that set_params() changes are honoured.
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.l1 = l1
        self.l2 = l2
        self.early_stopping_rounds = early_stopping_rounds
        self.seed = seed
        self.n_threads = n_threads
        self.verbose = verbose
        self.objective = objective
        self.metric = metric

    def _build_config(self, objective: Objective | None = None) -> GBLinearConfig:
        """Build config from current attributes.

        Called at fit() time to ensure set_params() changes are reflected.

        Parameters
        ----------
        objective : Objective, optional
            Override objective (used by classifier for multiclass). Takes
            precedence over ``self.objective``.

        Returns
        -------
        GBLinearConfig
            Config assembled from the current hyperparameters.

        Raises
        ------
        ValueError
            If the resolved objective is rejected by ``_validate_objective``.
        """
        # Resolution order: explicit override > user-supplied > subclass default.
        obj = objective if objective is not None else self.objective
        if obj is None:
            obj = self._get_default_objective()
        met = self.metric if self.metric is not None else self._get_default_metric()
        self._validate_objective(obj)
        return GBLinearConfig(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            early_stopping_rounds=self.early_stopping_rounds,
            seed=self.seed,
            objective=obj,
            metric=met,
            l1=self.l1,
            l2=self.l2,
        )

    @abstractmethod
    def _prepare_targets(self, y: NDArray[Any]) -> tuple[NDArray[np.float32], Objective | None]:
        """Prepare targets for training.

        For regressors, this simply casts to float32.
        For classifiers, this performs label encoding.

        Returns
        -------
        y_prepared : ndarray of shape (n_samples,)
            Prepared targets.
        objective_override : Objective or None
            Objective to use (e.g., softmax for multiclass), or None to use default.
        """
        ...

    @abstractmethod
    def _prepare_eval_targets(self, y: NDArray[Any]) -> NDArray[np.float32]:
        """Prepare evaluation set targets."""
        ...

    @staticmethod
    def _validate_sample_weight(
        sample_weight: NDArray[np.float32] | None,
        n_samples: int,
    ) -> NDArray[np.float32] | None:
        """Convert sample weights to a float32 vector and check their length.

        Raises
        ------
        ValueError
            If the weights are not one-dimensional with ``n_samples`` entries.
        """
        if sample_weight is None:
            return None
        weights = np.asarray(sample_weight, dtype=np.float32)
        if weights.shape != (n_samples,):
            raise ValueError(
                f"sample_weight has shape {weights.shape}, expected ({n_samples},)."
            )
        return weights

    def _ensure_n_features_match(self, X: NDArray[Any], *, source: str) -> None:  # noqa: N803
        """Raise ValueError if ``X`` has a different width than the training data.

        The check is skipped when ``n_features_in_`` has not been set.
        """
        expected = getattr(self, "n_features_in_", None)
        if expected is not None and X.shape[1] != expected:
            raise ValueError(
                f"{source} has {X.shape[1]} features, but {type(self).__name__} "
                f"was fitted with {expected} features."
            )

    def fit(
        self,
        X: NDArray[Any],  # noqa: N803 (sklearn convention for feature matrix)
        y: NDArray[Any],
        eval_set: tuple[NDArray[Any], NDArray[Any]] | None = None,
        sample_weight: NDArray[np.float32] | None = None,
    ) -> Self:
        """Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training input samples.
        y : array-like of shape (n_samples,)
            Target values.
        eval_set : tuple of (X, y), optional
            Validation set as (X_val, y_val) tuple.
        sample_weight : array-like of shape (n_samples,), optional
            Sample weights.

        Returns
        -------
        self
            Fitted estimator.

        Raises
        ------
        ValueError
            If the inputs fail validation, ``sample_weight`` does not have one
            entry per training sample, the validation set has a different
            number of features than ``X``, or the objective is invalid for
            this estimator type.
        """
        X, y = check_X_y(X, y, dtype=np.float32)  # noqa: N806 (sklearn convention)
        self.n_features_in_ = X.shape[1]
        weights = self._validate_sample_weight(sample_weight, X.shape[0])

        # Prepare targets (handles label encoding for classifiers).
        # Also returns an objective override for multiclass classification.
        y_prepared, objective_override = self._prepare_targets(y)

        # Build config at fit time to respect set_params() changes.
        config = self._build_config(objective=objective_override)

        train_data = Dataset(X, y_prepared, weights=weights)
        # Must run after _prepare_targets: classifiers encode eval labels with
        # the mapping learned from the training targets.
        val_data = self._build_val_set(eval_set)

        self.model_ = GBLinearModel.train(
            train_data,
            config=config,
            val_set=val_data,
            n_threads=self.n_threads,
        )
        return self

    def _build_val_set(self, eval_set: tuple[NDArray[Any], NDArray[Any]] | None) -> Dataset | None:
        """Build validation dataset from user input.

        Returns None when no ``eval_set`` was given.

        Raises
        ------
        ValueError
            If X_val / y_val fail validation (including inconsistent lengths)
            or X_val has a different number of features than the training data.
        """
        if eval_set is None:
            return None
        X_val, y_val = eval_set  # noqa: N806 (sklearn convention)
        # Validate X and y together so that length mismatches are caught here
        # rather than inside the backend.
        X_val, y_val = check_X_y(X_val, y_val, dtype=np.float32)  # noqa: N806
        self._ensure_n_features_match(X_val, source="eval_set X")
        return Dataset(X_val, self._prepare_eval_targets(y_val))

    def predict(self, X: NDArray[Any]) -> NDArray[np.float32]:  # noqa: N803 (sklearn convention)
        """Predict using the fitted model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted values.

        Raises
        ------
        ValueError
            If ``X`` has a different number of features than seen during fit.
        """
        check_is_fitted(self, ["model_"])
        X = check_array(X, dtype=np.float32)  # noqa: N806 (sklearn convention)
        self._ensure_n_features_match(X, source="X")
        preds: NDArray[np.float32] = self.model_.predict(Dataset(X))
        # Drop the trailing axis of the model output (raises if it is not size 1).
        return np.squeeze(preds, axis=-1)

    @property
    def coef_(self) -> NDArray[np.float32]:
        """Coefficient weights (delegates to ``model_.coef_``)."""
        check_is_fitted(self, ["model_"])
        return self.model_.coef_

    @property
    def intercept_(self) -> NDArray[np.float32]:
        """Intercept (bias) term (delegates to ``model_.intercept_``)."""
        check_is_fitted(self, ["model_"])
        return self.model_.intercept_
# =============================================================================
# Regressor
# =============================================================================
class GBLinearRegressor(RegressorMixin, _GBLinearEstimatorBase):
    """Gradient Boosted Linear Regressor.

    A sklearn-compatible wrapper around GBLinearModel for linear regression.

    The mixin is listed before the estimator base so that it precedes
    ``BaseEstimator`` in the MRO, as scikit-learn requires for mixins.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds.
    learning_rate : float, default=0.5
        Step size for weight updates.
    l1 : float, default=0.0
        L1 regularization (alpha).
    l2 : float, default=1.0
        L2 regularization (lambda).
    early_stopping_rounds : int or None, default=None
        Stop if no improvement for this many rounds.
    seed : int, default=42
        Random seed.
    n_threads : int, default=0
        Passed to ``GBLinearModel.train`` as ``n_threads``.
    verbose : int, default=1
        Verbosity setting, stored as a hyperparameter.
    objective : Objective or None, default=None
        Loss function. Must be a regression objective.
        If None, uses Objective.squared().
    metric : Metric or None, default=None
        Evaluation metric. If None, uses Metric.rmse().

    Attributes
    ----------
    model_ : GBLinearModel
        The fitted core model.
    coef_ : ndarray of shape (n_features,)
        Coefficient weights.
    intercept_ : ndarray of shape (1,)
        Intercept (bias) term.
    n_features_in_ : int
        Number of features seen during fit.
    """

    # Substrings of ``str(objective).lower()`` that mark a classification
    # objective and are therefore rejected by this estimator.
    _CLASSIFICATION_KEYWORDS = ("logistic", "softmax", "cross")

    @classmethod
    def _get_default_objective(cls) -> Objective:
        """Return ``Objective.squared()``, the default regression loss."""
        return Objective.squared()

    @classmethod
    def _get_default_metric(cls) -> Metric | None:
        """Return ``Metric.rmse()``, the default regression metric."""
        return Metric.rmse()

    @classmethod
    def _validate_objective(cls, objective: Objective) -> None:
        """Reject objectives whose string form names a classification loss.

        Raises
        ------
        ValueError
            If the lower-cased ``str(objective)`` contains any of
            ``_CLASSIFICATION_KEYWORDS``.
        """
        obj_name = str(objective).lower()
        if any(keyword in obj_name for keyword in cls._CLASSIFICATION_KEYWORDS):
            raise ValueError(
                f"GBLinearRegressor requires a regression objective, got {objective}. "
                f"Use Objective.squared(), etc. "
                f"For classification, use GBLinearClassifier instead."
            )

    def _prepare_targets(self, y: NDArray[Any]) -> tuple[NDArray[np.float32], Objective | None]:
        """Cast regression targets to float32; never overrides the objective."""
        return np.asarray(y, dtype=np.float32), None

    def _prepare_eval_targets(self, y: NDArray[Any]) -> NDArray[np.float32]:
        """Cast evaluation set targets to float32."""
        return np.asarray(y, dtype=np.float32)
# =============================================================================
# Classifier
# =============================================================================
class GBLinearClassifier(ClassifierMixin, _GBLinearEstimatorBase):
    """Gradient Boosted Linear Classifier.

    A sklearn-compatible wrapper around GBLinearModel for classification.

    The mixin is listed before the estimator base so that it precedes
    ``BaseEstimator`` in the MRO, as scikit-learn requires for mixins.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds.
    learning_rate : float, default=0.5
        Step size for weight updates.
    l1 : float, default=0.0
        L1 regularization.
    l2 : float, default=1.0
        L2 regularization.
    early_stopping_rounds : int or None, default=None
        Stop if no improvement for this many rounds.
    seed : int, default=42
        Random seed.
    n_threads : int, default=0
        Passed to ``GBLinearModel.train`` as ``n_threads``.
    verbose : int, default=1
        Verbosity setting, stored as a hyperparameter.
    objective : Objective or None, default=None
        Loss function. Must be a classification objective.
        If None, auto-detects: Objective.logistic() for binary,
        Objective.softmax() for multiclass.
    metric : Metric or None, default=None
        Evaluation metric. If None, uses Metric.logloss().

    Attributes
    ----------
    model_ : GBLinearModel
        The fitted core model.
    classes_ : ndarray
        Unique class labels, sorted as returned by ``np.unique``.
    n_classes_ : int
        Number of distinct class labels seen during fit.
    coef_ : ndarray
        Coefficient weights.
    intercept_ : ndarray
        Intercept terms.
    """

    # Additional instance attributes for classifier (all set by fit()).
    classes_: NDArray[Any]
    n_classes_: int
    # Maps each original label to its index in ``classes_``.
    _label_to_idx: Mapping[Any, int]

    # Substrings of ``str(objective).lower()`` that mark a regression
    # objective and are therefore rejected by this estimator.
    _REGRESSION_KEYWORDS = (
        "squared",
        "absolute",
        "huber",
        "quantile",
        "tweedie",
        "poisson",
        "gamma",
    )

    @classmethod
    def _get_default_objective(cls) -> Objective:
        """Return ``Objective.logistic()``, the default (binary) loss."""
        return Objective.logistic()

    @classmethod
    def _get_default_metric(cls) -> Metric | None:
        """Return ``Metric.logloss()``, the default classification metric."""
        return Metric.logloss()

    @classmethod
    def _validate_objective(cls, objective: Objective) -> None:
        """Reject objectives whose string form names a regression loss.

        Raises
        ------
        ValueError
            If the lower-cased ``str(objective)`` contains any of
            ``_REGRESSION_KEYWORDS``.
        """
        obj_name = str(objective).lower()
        if any(keyword in obj_name for keyword in cls._REGRESSION_KEYWORDS):
            raise ValueError(
                f"GBLinearClassifier requires a classification objective, got {objective}. "
                f"Use Objective.logistic() for binary or Objective.softmax() for multiclass. "
                f"For regression, use GBLinearRegressor instead."
            )

    def _prepare_targets(self, y: NDArray[Any]) -> tuple[NDArray[np.float32], Objective | None]:
        """Prepare classification targets with label encoding.

        Sets ``classes_``, ``n_classes_`` and ``_label_to_idx`` as a side effect.

        Returns
        -------
        y_encoded : ndarray of shape (n_samples,)
            Index of each label within ``classes_``, as float32.
        objective_override : Objective or None
            ``Objective.softmax(n_classes_)`` when there are more than two
            classes and the user did not set ``objective``; otherwise None.
        """
        # fit() passes y through check_X_y first, so it is one-dimensional here.
        # return_inverse yields each sample's index into the sorted unique
        # labels, i.e. the same encoding as a per-sample dictionary lookup.
        self.classes_, encoded = np.unique(np.asarray(y), return_inverse=True)
        self.n_classes_ = len(self.classes_)
        # Kept for encoding evaluation-set labels later.
        self._label_to_idx = {label: idx for idx, label in enumerate(self.classes_)}

        # Auto-switch to softmax for multiclass (if user didn't specify objective).
        objective_override: Objective | None = None
        if self.n_classes_ > 2 and self.objective is None:
            objective_override = Objective.softmax(self.n_classes_)

        return encoded.astype(np.float32), objective_override

    def _prepare_eval_targets(self, y: NDArray[Any]) -> NDArray[np.float32]:
        """Prepare evaluation set targets with label encoding.

        Uses the label mapping learned from the training targets.

        Raises
        ------
        KeyError
            If ``y`` contains a label that was not present in the training
            targets.
        """
        try:
            encoded = [self._label_to_idx[label] for label in y]
        except KeyError as err:
            # Same exception type as a plain failed lookup, but with enough
            # context to diagnose the problem.
            raise KeyError(
                f"eval_set contains label {err.args[0]!r} that was not seen in the "
                f"training targets; known classes: {self.classes_.tolist()}"
            ) from None
        return np.array(encoded, dtype=np.float32)

    def predict(self, X: NDArray[Any]) -> NDArray[Any]:  # noqa: N803 (sklearn convention)
        """Predict class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels, taken from ``classes_``.
        """
        check_is_fitted(self, ["model_", "classes_"])
        proba = self.predict_proba(X)
        # Binary: threshold the second column at 0.5.
        # Otherwise: pick the most probable class.
        if self.n_classes_ == 2:  # noqa: SIM108 (ternary less readable here)
            indices = (proba[:, 1] >= 0.5).astype(int)
        else:
            indices = np.argmax(proba, axis=1)
        return self.classes_[indices]

    def predict_proba(self, X: NDArray[Any]) -> NDArray[np.float32]:  # noqa: N803 (sklearn convention)
        """Predict class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class probability estimates.

        Raises
        ------
        ValueError
            If ``X`` has a different number of features than seen during fit.
        """
        check_is_fitted(self, ["model_"])
        X = check_array(X, dtype=np.float32)  # noqa: N806 (sklearn convention)
        # Skipped when n_features_in_ has not been set.
        expected = getattr(self, "n_features_in_", None)
        if expected is not None and X.shape[1] != expected:
            raise ValueError(
                f"X has {X.shape[1]} features, but {type(self).__name__} "
                f"was fitted with {expected} features."
            )
        preds: NDArray[np.float32] = self.model_.predict(Dataset(X))
        if self.n_classes_ != 2:
            # Model output is returned as-is.
            return preds
        # Binary: the single model output column is used as the second
        # (classes_[1]) column; the first column is its complement.
        positive = np.squeeze(preds, axis=-1)
        proba: NDArray[np.float32] = np.column_stack([1 - positive, positive])
        return proba