Source code for boosters.sklearn.gblinear

"""Gradient Boosted Linear sklearn-compatible estimators."""

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Self

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

from boosters import Dataset, GBLinearConfig, GBLinearModel, Metric, Objective

if TYPE_CHECKING:
    from collections.abc import Mapping

    from numpy.typing import NDArray

__all__ = ["GBLinearClassifier", "GBLinearRegressor"]


# =============================================================================
# Base Estimator
# =============================================================================


class _GBLinearEstimatorBase(BaseEstimator, ABC):
    """Base class for GBLinear estimators.

    Handles common initialization, config creation, and fitting logic.
    Subclasses define task-specific behavior (regression vs classification).
    """

    # Instance attributes (declared for type checking)
    model_: GBLinearModel
    n_features_in_: int

    @classmethod
    @abstractmethod
    def _get_default_objective(cls) -> Objective:
        """Return the default objective for this estimator type."""
        ...

    @classmethod
    @abstractmethod
    def _get_default_metric(cls) -> Metric | None:
        """Return the default metric for this estimator type."""
        ...

    @classmethod
    @abstractmethod
    def _validate_objective(cls, objective: Objective) -> None:
        """Validate objective is appropriate for this estimator type.

        Raises
        ------
        ValueError
            If objective is not valid for this estimator type.
        """
        ...

    def __init__(  # noqa: PLR0913 (sklearn estimators have many hyperparameters)
        self,
        n_estimators: int = 100,
        learning_rate: float = 0.5,
        l1: float = 0.0,
        l2: float = 1.0,
        early_stopping_rounds: int | None = None,
        seed: int = 42,
        n_threads: int = 0,
        verbose: int = 1,
        objective: Objective | None = None,
        metric: Metric | None = None,
    ) -> None:
        # Store all parameters (sklearn convention)
        # Config is built at fit() time from these attributes to support set_params()
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.l1 = l1
        self.l2 = l2
        self.early_stopping_rounds = early_stopping_rounds
        self.seed = seed
        self.n_threads = n_threads
        self.verbose = verbose
        self.objective = objective
        self.metric = metric

    def _build_config(self, objective: Objective | None = None) -> GBLinearConfig:
        """Build config from current attributes.

        Called at fit() time to ensure set_params() changes are reflected.

        Parameters
        ----------
        objective : Objective, optional
            Override objective (used by classifier for multiclass).
        """
        obj = objective if objective is not None else self.objective
        if obj is None:
            obj = self._get_default_objective()
        met = self.metric if self.metric is not None else self._get_default_metric()
        self._validate_objective(obj)

        return GBLinearConfig(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            early_stopping_rounds=self.early_stopping_rounds,
            seed=self.seed,
            objective=obj,
            metric=met,
            l1=self.l1,
            l2=self.l2,
        )
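
    # Because the config is rebuilt from instance attributes on every fit(),
    # sklearn-style hyperparameter updates take effect without constructing a
    # new estimator. A minimal sketch (hypothetical values):
    #
    #     est = GBLinearRegressor(learning_rate=0.5)
    #     est.set_params(learning_rate=0.1, l2=2.0)
    #     est.fit(X, y)  # _build_config() now sees learning_rate=0.1, l2=2.0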

    @abstractmethod
    def _prepare_targets(self, y: NDArray[Any]) -> tuple[NDArray[np.float32], Objective | None]:
        """Prepare targets for training.

        For regressors, this simply casts to float32.
        For classifiers, this performs label encoding.

        Returns
        -------
        y_prepared : ndarray of shape (n_samples,)
            Prepared targets.
        objective_override : Objective or None
            Objective to use (e.g., softmax for multiclass), or None to use default.
        """
        ...

    @abstractmethod
    def _prepare_eval_targets(self, y: NDArray[Any]) -> NDArray[np.float32]:
        """Prepare evaluation set targets."""
        ...

    def fit(
        self,
        X: NDArray[Any],  # noqa: N803 (sklearn convention for feature matrix)
        y: NDArray[Any],
        eval_set: tuple[NDArray[Any], NDArray[Any]] | None = None,
        sample_weight: NDArray[np.float32] | None = None,
    ) -> Self:
        """Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training input samples.
        y : array-like of shape (n_samples,)
            Target values.
        eval_set : tuple of (X, y), optional
            Validation set as (X_val, y_val) tuple.
        sample_weight : array-like of shape (n_samples,), optional
            Sample weights.

        Returns
        -------
        self
            Fitted estimator.
        """
        X, y = check_X_y(X, y, dtype=np.float32)  # noqa: N806 (sklearn convention)
        self.n_features_in_ = X.shape[1]

        # Prepare targets (handles label encoding for classifiers)
        # Also returns objective override for multiclass classification
        y_prepared, objective_override = self._prepare_targets(y)

        # Build config at fit time to respect set_params() changes
        config = self._build_config(objective=objective_override)

        train_data = Dataset(X, y_prepared, weights=sample_weight)
        val_data = self._build_val_set(eval_set)

        self.model_ = GBLinearModel.train(
            train_data,
            config=config,
            val_set=val_data,
            n_threads=self.n_threads,
        )

        return self
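
    # Sketch of fitting with a validation set (hypothetical data; eval_set is
    # a plain (X_val, y_val) tuple, and early stopping only engages when
    # early_stopping_rounds is set):
    #
    #     est = GBLinearRegressor(n_estimators=500, early_stopping_rounds=20)
    #     est.fit(X_train, y_train, eval_set=(X_val, y_val))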

    def _build_val_set(self, eval_set: tuple[NDArray[Any], NDArray[Any]] | None) -> Dataset | None:
        """Build validation dataset from user input."""
        if eval_set is None:
            return None

        X_val, y_val = eval_set  # noqa: N806 (sklearn convention)
        X_val = check_array(X_val, dtype=np.float32)  # noqa: N806 (sklearn convention)
        y_val_prepared = self._prepare_eval_targets(y_val)
        return Dataset(X_val, y_val_prepared)

    def predict(self, X: NDArray[Any]) -> NDArray[np.float32]:  # noqa: N803 (sklearn convention)
        """Predict using the fitted model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted values.
        """
        check_is_fitted(self, ["model_"])
        X = check_array(X, dtype=np.float32)  # noqa: N806 (sklearn convention)
        preds: NDArray[np.float32] = self.model_.predict(Dataset(X))
        return np.squeeze(preds, axis=-1)
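
    # Note: the core model emits a trailing output axis; predict() squeezes it
    # so single-output models return shape (n_samples,). A sketch (hypothetical
    # shapes):
    #
    #     reg.model_.predict(Dataset(X)).shape  # (n_samples, 1)
    #     reg.predict(X).shape                  # (n_samples,)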

    @property
    def coef_(self) -> NDArray[np.float32]:
        """Coefficient weights."""
        check_is_fitted(self, ["model_"])
        return self.model_.coef_

    @property
    def intercept_(self) -> NDArray[np.float32]:
        """Intercept (bias) term."""
        check_is_fitted(self, ["model_"])
        return self.model_.intercept_


# =============================================================================
# Regressor
# =============================================================================


class GBLinearRegressor(_GBLinearEstimatorBase, RegressorMixin):
    """Gradient Boosted Linear Regressor.

    A sklearn-compatible wrapper around GBLinearModel for linear regression.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds.
    learning_rate : float, default=0.5
        Step size for weight updates.
    l1 : float, default=0.0
        L1 regularization (alpha).
    l2 : float, default=1.0
        L2 regularization (lambda).
    early_stopping_rounds : int or None, default=None
        Stop if no improvement for this many rounds.
    seed : int, default=42
        Random seed.
    n_threads : int, default=0
        Number of threads passed to GBLinearModel.train().
    verbose : int, default=1
        Verbosity level.
    objective : Objective or None, default=None
        Loss function. Must be a regression objective. If None, uses
        Objective.squared().
    metric : Metric or None, default=None
        Evaluation metric. If None, uses Metric.rmse().

    Attributes
    ----------
    model_ : GBLinearModel
        The fitted core model.
    coef_ : ndarray of shape (n_features,)
        Coefficient weights.
    intercept_ : ndarray of shape (1,)
        Intercept (bias) term.
    n_features_in_ : int
        Number of features seen during fit.
    """

    _CLASSIFICATION_KEYWORDS = ("logistic", "softmax", "cross")

    @classmethod
    def _get_default_objective(cls) -> Objective:
        return Objective.squared()

    @classmethod
    def _get_default_metric(cls) -> Metric | None:
        return Metric.rmse()

    @classmethod
    def _validate_objective(cls, objective: Objective) -> None:
        obj_name = str(objective).lower()
        if any(x in obj_name for x in cls._CLASSIFICATION_KEYWORDS):
            raise ValueError(
                f"GBLinearRegressor requires a regression objective, got {objective}. "
                "Use Objective.squared(), etc. "
                "For classification, use GBLinearClassifier instead."
            )

    def _prepare_targets(self, y: NDArray[Any]) -> tuple[NDArray[np.float32], Objective | None]:
        """Prepare regression targets."""
        return np.asarray(y, dtype=np.float32), None

    def _prepare_eval_targets(self, y: NDArray[Any]) -> NDArray[np.float32]:
        """Prepare evaluation set targets for regression."""
        return np.asarray(y, dtype=np.float32)
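

# Example usage (a hypothetical sketch; the synthetic data and hyperparameter
# values are illustrative only):
#
#     import numpy as np
#     rng = np.random.default_rng(0)
#     X = rng.normal(size=(200, 4)).astype(np.float32)
#     y = X @ np.array([1.5, -2.0, 0.0, 0.5], dtype=np.float32)
#     reg = GBLinearRegressor(n_estimators=50, learning_rate=0.3).fit(X, y)
#     print(reg.coef_, reg.intercept_)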


# =============================================================================
# Classifier
# =============================================================================


class GBLinearClassifier(_GBLinearEstimatorBase, ClassifierMixin):
    """Gradient Boosted Linear Classifier.

    A sklearn-compatible wrapper around GBLinearModel for classification.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds.
    learning_rate : float, default=0.5
        Step size for weight updates.
    l1 : float, default=0.0
        L1 regularization.
    l2 : float, default=1.0
        L2 regularization.
    early_stopping_rounds : int or None, default=None
        Stop if no improvement for this many rounds.
    seed : int, default=42
        Random seed.
    n_threads : int, default=0
        Number of threads passed to GBLinearModel.train().
    verbose : int, default=1
        Verbosity level.
    objective : Objective or None, default=None
        Loss function. Must be a classification objective. If None,
        auto-detects: Objective.logistic() for binary, Objective.softmax()
        for multiclass.
    metric : Metric or None, default=None
        Evaluation metric. If None, uses Metric.logloss().

    Attributes
    ----------
    model_ : GBLinearModel
        The fitted core model.
    classes_ : ndarray
        Unique class labels.
    n_classes_ : int
        Number of unique classes.
    coef_ : ndarray
        Coefficient weights.
    intercept_ : ndarray
        Intercept terms.
    n_features_in_ : int
        Number of features seen during fit.
    """

    # Additional instance attributes for classifier
    classes_: NDArray[Any]
    n_classes_: int
    _label_to_idx: Mapping[Any, int]

    _REGRESSION_KEYWORDS = (
        "squared",
        "absolute",
        "huber",
        "quantile",
        "tweedie",
        "poisson",
        "gamma",
    )

    @classmethod
    def _get_default_objective(cls) -> Objective:
        return Objective.logistic()

    @classmethod
    def _get_default_metric(cls) -> Metric | None:
        return Metric.logloss()

    @classmethod
    def _validate_objective(cls, objective: Objective) -> None:
        obj_name = str(objective).lower()
        if any(x in obj_name for x in cls._REGRESSION_KEYWORDS):
            raise ValueError(
                f"GBLinearClassifier requires a classification objective, got {objective}. "
                "Use Objective.logistic() for binary or Objective.softmax() for multiclass. "
                "For regression, use GBLinearRegressor instead."
            )

    def _prepare_targets(self, y: NDArray[Any]) -> tuple[NDArray[np.float32], Objective | None]:
        """Prepare classification targets with label encoding.

        Returns a softmax objective override for multiclass if the user
        didn't specify an objective.
        """
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        self._label_to_idx = {c: i for i, c in enumerate(self.classes_)}

        # Auto-switch to softmax for multiclass (if user didn't specify objective)
        objective_override: Objective | None = None
        if self.n_classes_ > 2 and self.objective is None:
            objective_override = Objective.softmax(self.n_classes_)

        y_encoded = np.array([self._label_to_idx[c] for c in y], dtype=np.float32)
        return y_encoded, objective_override

    def _prepare_eval_targets(self, y: NDArray[Any]) -> NDArray[np.float32]:
        """Prepare evaluation set targets with label encoding."""
        return np.array([self._label_to_idx[c] for c in y], dtype=np.float32)

    def predict(self, X: NDArray[Any]) -> NDArray[Any]:  # noqa: N803 (sklearn convention)
        """Predict class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels.
        """
        check_is_fitted(self, ["model_", "classes_"])
        proba = self.predict_proba(X)

        # Binary vs multiclass classification threshold
        if self.n_classes_ == 2:  # noqa: SIM108 (ternary less readable here)
            indices = (proba[:, 1] >= 0.5).astype(int)
        else:
            indices = np.argmax(proba, axis=1)

        return self.classes_[indices]

    def predict_proba(self, X: NDArray[Any]) -> NDArray[np.float32]:  # noqa: N803 (sklearn convention)
        """Predict class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class probability estimates.
        """
        check_is_fitted(self, ["model_"])
        X = check_array(X, dtype=np.float32)  # noqa: N806 (sklearn convention)
        preds: NDArray[np.float32] = self.model_.predict(Dataset(X))

        if self.n_classes_ == 2:
            preds_1d = np.squeeze(preds, axis=-1)
            proba: NDArray[np.float32] = np.column_stack([1 - preds_1d, preds_1d])
        else:
            proba = preds
        return proba
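

# Example usage (a hypothetical sketch; labels may be arbitrary hashable
# values, since fit() label-encodes them and predict() maps indices back
# through classes_):
#
#     import numpy as np
#     rng = np.random.default_rng(0)
#     X = rng.normal(size=(150, 3)).astype(np.float32)
#     y = rng.choice(["a", "b", "c"], size=150)  # 3 classes -> softmax
#     clf = GBLinearClassifier(n_estimators=30).fit(X, y)
#     proba = clf.predict_proba(X)  # shape (150, 3)
#     labels = clf.predict(X)       # values drawn from clf.classes_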