import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

import imodels
import imodels.algebraic.gam_multitask


class ResidualBoostingRegressor(BaseEstimator, RegressorMixin):
    """A meta-estimator that fits a base estimator to the residuals of the
    previous estimators.

    Parameters:
    - estimator: The estimator to fit on the residual of the previous step.
    - n_estimators: The number of estimators to fit.
    """

    def __init__(self, estimator, n_estimators=10):
        self.estimator = estimator
        self.n_estimators = n_estimators

    def fit(self, X, y):
        """
        Fit the ensemble of base estimators on the training data.

        Parameters:
        - X: array-like of shape (n_samples, n_features)
            Training data.
        - y: array-like of shape (n_samples,)
            Target values.

        Returns:
        - self: object
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        self.estimators_ = []
        current_prediction = np.zeros(y.shape)

        for _ in range(self.n_estimators):
            residual = y - current_prediction
            estimator = clone(self.estimator)
            estimator.fit(X, residual)
            self.estimators_.append(estimator)
            current_prediction += estimator.predict(X)

        return self

    def predict(self, X):
        """
        Predict regression target for X.

        Parameters:
        - X: array-like of shape (n_samples, n_features)
            The input samples.

        Returns:
        - y_pred: ndarray of shape (n_samples,)
            The predicted values.
        """
        # Check that fit has been called
        check_is_fitted(self)
        # Input validation
        X = check_array(X)

        predictions = sum(estimator.predict(X)
                          for estimator in self.estimators_)
        return predictions


class SimpleBaggingRegressor:
    """A minimal bagging ensemble: fits each clone of the base estimator on a
    bootstrap resample of the training data and averages their predictions.

    Parameters:
    - estimator: The base estimator to clone and fit on each bootstrap sample.
    - n_estimators: The number of estimators to fit.
    - random_state: Seed for the bootstrap sampling.
    """

    def __init__(self, estimator, n_estimators=10, random_state=None):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.random_state = random_state

    def fit(self, X, y):
        self.estimators_ = []
        rng = np.random.default_rng(self.random_state)

        for _ in range(self.n_estimators):
            # Bootstrap sampling: n row indices drawn with replacement
            sample_indices = rng.choice(
                X.shape[0], size=X.shape[0], replace=True)
            X_sample = X[sample_indices]
            y_sample = y[sample_indices]

            # Fit a clone of the base estimator on the bootstrap sample
            estimator = clone(self.estimator)
            estimator.fit(X_sample, y_sample)
            self.estimators_.append(estimator)

        return self

    def predict(self, X):
        # Collect predictions from each base estimator
        predictions = np.array([estimator.predict(X)
                               for estimator in self.estimators_])

        # Aggregate predictions
        return np.mean(predictions, axis=0)


if __name__ == '__main__':
    X, y, feature_names = imodels.get_clean_dataset('california_housing')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    X_train = X_train[:50, :2]
    y_train = y_train[:50]
    X_test = X_test[:50, :2]
    y_test = y_test[:50]
    # estimator = DecisionTreeRegressor(max_depth=3)
    estimator = imodels.algebraic.gam_multitask.MultiTaskGAMRegressor()
    for n_estimators in [1, 3, 5]:
        # regressor = ResidualBoostingRegressor(
        #     estimator=estimator, n_estimators=n_estimators)
        regressor = SimpleBaggingRegressor(
            estimator=estimator, n_estimators=n_estimators)
        regressor.fit(X_train, y_train)

        y_pred = regressor.predict(X_test)
        mse_train = mean_squared_error(
            y_train, regressor.predict(X_train))
        mse = mean_squared_error(y_test, y_pred)
        print(
            f'MSE with {n_estimators} estimators: {mse:.2f} (train: {mse_train:.2f})')

Classes

class ResidualBoostingRegressor (estimator, n_estimators=10)


A meta-estimator that fits a base estimator to the residuals of the previous estimators.

Parameters:

  • estimator: The estimator to fit on the residual of the previous step.
  • n_estimators: The number of estimators to fit.
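
A minimal usage sketch (hypothetical data; a shallow tree is used as the base learner purely for illustration):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
y = X[:, 0] ** 2 + X[:, 1] + rng.normal(scale=0.1, size=200)

model = ResidualBoostingRegressor(
    estimator=DecisionTreeRegressor(max_depth=2), n_estimators=20)
model.fit(X, y)
y_pred = model.predict(X)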


Ancestors

  • sklearn.base.BaseEstimator
  • sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester
  • sklearn.base.RegressorMixin

Methods

def fit(self, X, y)

Fit the ensemble of base estimators on the training data.

Parameters:

  • X: array-like of shape (n_samples, n_features). Training data.
  • y: array-like of shape (n_samples,). Target values.

Returns:

  • self: object
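
Each stage fits a fresh clone of the base estimator to the residual of the running prediction, so after m stages the model predicts f_1(X) + ... + f_m(X). A small sketch that unrolls two stages by hand (hypothetical data), mirroring what the loop in fit does:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 1))
y = np.sin(X[:, 0])

f1 = DecisionTreeRegressor(max_depth=1).fit(X, y)
r1 = y - f1.predict(X)                # residual after stage 1
f2 = DecisionTreeRegressor(max_depth=1).fit(X, r1)
pred = f1.predict(X) + f2.predict(X)  # two-stage ensemble prediction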

def predict(self, X)

Predict regression target for X.

Parameters:

  • X: array-like of shape (n_samples, n_features). The input samples.

Returns:

  • y_pred: ndarray of shape (n_samples,). The predicted values.
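
Because predict calls check_is_fitted, calling it on an unfitted instance raises scikit-learn's NotFittedError. A quick sanity check:

from sklearn.exceptions import NotFittedError
from sklearn.tree import DecisionTreeRegressor

model = ResidualBoostingRegressor(DecisionTreeRegressor())
try:
    model.predict([[0.0, 1.0]])
except NotFittedError as err:
    print(err)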

def set_score_request(self, *, sample_weight: Union[bool, None, str] = '$UNCHANGED$') -> ResidualBoostingRegressor

Request metadata passed to the score method.

Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). See the scikit-learn User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

  • True: metadata is requested, and passed to score if provided. The request is ignored if metadata is not provided.

  • False: metadata is not requested and the meta-estimator will not pass it to score.

  • None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

  • str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in version: 1.3

Note

This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. used inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.

Parameters

sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
Metadata routing for sample_weight parameter in score.

Returns

self : object
The updated object.
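
A minimal sketch of enabling routing and requesting sample_weight in score (assumes scikit-learn >= 1.3):

import sklearn
from sklearn.tree import DecisionTreeRegressor

sklearn.set_config(enable_metadata_routing=True)
model = ResidualBoostingRegressor(DecisionTreeRegressor())
model.set_score_request(sample_weight=True)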
class SimpleBaggingRegressor (estimator, n_estimators=10, random_state=None)
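
A minimal bagging ensemble: each base estimator is a clone fit on a bootstrap resample of the training data, and predictions are averaged across the ensemble.

A short usage sketch (hypothetical data; any regressor exposing fit/predict works as the base estimator):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=200)

bagger = SimpleBaggingRegressor(
    estimator=DecisionTreeRegressor(max_depth=4),
    n_estimators=25, random_state=0)
bagger.fit(X, y)
y_pred = bagger.predict(X)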
Methods

def fit(self, X, y)
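
Fits n_estimators clones of the base estimator, each on an independent bootstrap resample of (X, y): n row indices are drawn with replacement, so each resample omits roughly 37% of the rows on average. A quick illustration of the resampling (illustrative values only):

import numpy as np

rng = np.random.default_rng(0)
idx = rng.choice(10, size=10, replace=True)
print(idx)                  # repeated and missing indices
print(np.unique(idx).size)  # typically around 6-7 unique of 10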
def predict(self, X)
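
Stacks the per-estimator predictions into an (n_estimators, n_samples) array and averages along axis 0. The aggregation step in isolation:

import numpy as np

preds = np.array([[1.0, 2.0],   # predictions from estimator 1
                  [3.0, 4.0]])  # predictions from estimator 2
print(np.mean(preds, axis=0))   # [2. 3.]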