Expand source code
from copy import deepcopy
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import check_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y
from sklearn.utils.validation import _check_sample_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

import imodels

from sklearn.base import RegressorMixin, ClassifierMixin


class TreeGAMSimple(BaseEstimator):
    """Tree-based GAM classifier.
    Uses cyclical boosting to fit a GAM with small trees.
    Simplified version of the explainable boosting machine described in https://github.com/interpretml/interpret
    Only works for binary classification.
    Fits a scalar bias to the mean.
    """

    def __init__(
        self,
        n_boosting_rounds=100,
        max_leaf_nodes=3,
        learning_rate: float = 0.01,
        boosting_strategy="cyclic",
        validation_frac=0.15,
        random_state=None,
    ):
        """
        Params
        ------
        n_boosting_rounds : int
            Number of boosting rounds for the cyclic boosting.
        max_leaf_nodes : int
            Maximum number of leaf nodes for the trees in the cyclic boosting.
        learning_rate: float
            Learning rate for the cyclic boosting.
        boosting_strategy : str ["cyclic", "greedy"]
            Whether to use cyclic boosting (cycle over features) or greedy boosting (select the best feature at each step).
        validation_frac: float
            Fraction of data to use for early stopping.
        random_state : int
            Random seed.
        """
        self.n_boosting_rounds = n_boosting_rounds
        self.max_leaf_nodes = max_leaf_nodes
        self.learning_rate = learning_rate
        self.boosting_strategy = boosting_strategy
        self.validation_frac = validation_frac
        self.random_state = random_state

    def fit(self, X, y, sample_weight=None):
        X, y = check_X_y(X, y, accept_sparse=False, multi_output=False)
        if isinstance(self, ClassifierMixin):
            check_classification_targets(y)
            self.classes_, y = np.unique(y, return_inverse=True)

        sample_weight = _check_sample_weight(sample_weight, X, dtype=None)

        # split into train and validation for early stopping
        (
            X_train,
            X_val,
            y_train,
            y_val,
            sample_weight_train,
            sample_weight_val,
        ) = train_test_split(
            X,
            y,
            sample_weight,
            test_size=self.validation_frac,
            random_state=self.random_state,
            stratify=y if isinstance(self, ClassifierMixin) else None,
        )

        self.estimators_ = []
        self.bias_ = np.mean(y)

        self._cyclic_boost(
            X_train,
            y_train,
            sample_weight_train,
            X_val,
            y_val,
            sample_weight_val,
        )

        self.mse_val_ = self._calc_mse(X_val, y_val, sample_weight_val)

        return self

    def _cyclic_boost(
        self, X_train, y_train, sample_weight_train, X_val, y_val, sample_weight_val
    ):
        """Apply cyclic boosting, storing trees in self.estimators_"""

        residuals_train = y_train - self.predict_proba(X_train)[:, 1]
        mse_val = self._calc_mse(X_val, y_val, sample_weight_val)
        for _ in range(self.n_boosting_rounds):
            boosting_round_ests = []
            boosting_round_mses = []
            for feature_num in range(X_train.shape[1]):
                X_ = np.zeros_like(X_train)
                X_[:, feature_num] = X_train[:, feature_num]
                est = DecisionTreeRegressor(
                    max_leaf_nodes=self.max_leaf_nodes,
                    random_state=self.random_state,
                )
                est.fit(X_, residuals_train, sample_weight=sample_weight_train)
                # keep the tree only if its root splits on this feature
                # (feature == -2 marks a leaf, i.e. the tree found no split)
                root_feature = est.tree_.feature[0]
                successfully_split_on_feature = root_feature in (feature_num, -2)
                if not successfully_split_on_feature:
                    continue
                self.estimators_.append(est)
                residuals_train_new = (
                    residuals_train - self.learning_rate * est.predict(X_train)
                )
                if self.boosting_strategy == "cyclic":
                    residuals_train = residuals_train_new
                elif self.boosting_strategy == "greedy":
                    # score with the candidate tree included (it was just appended)
                    mse_train_new = self._calc_mse(
                        X_train, y_train, sample_weight_train
                    )
                    # pop the candidate; only the round's best tree is kept below
                    boosting_round_ests.append(deepcopy(self.estimators_.pop()))
                    boosting_round_mses.append(mse_train_new)

            if self.boosting_strategy == "greedy" and boosting_round_ests:
                best_est = boosting_round_ests[np.argmin(boosting_round_mses)]
                self.estimators_.append(best_est)
                residuals_train = (
                    residuals_train - self.learning_rate * best_est.predict(X_train)
                )

            # early stopping if validation error does not decrease
            mse_val_new = self._calc_mse(X_val, y_val, sample_weight_val)
            if mse_val_new >= mse_val:
                return
            else:
                mse_val = mse_val_new

    def predict_proba(self, X):
        X = check_array(X, accept_sparse=False, dtype=None)
        check_is_fitted(self)
        # bias plus the learning-rate-scaled sum of all tree predictions
        probs1 = np.ones(X.shape[0]) * self.bias_
        for est in self.estimators_:
            probs1 += self.learning_rate * est.predict(X)
        # clip into [0, 1]; this also bounds the regressor's predictions
        probs1 = np.clip(probs1, a_min=0, a_max=1)
        return np.array([1 - probs1, probs1]).T

    def predict(self, X):
        if isinstance(self, RegressorMixin):
            return self.predict_proba(X)[:, 1]
        elif isinstance(self, ClassifierMixin):
            return np.argmax(self.predict_proba(X), axis=1)

    def _calc_mse(self, X, y, sample_weight=None):
        return np.average(
            np.square(y - self.predict_proba(X)[:, 1]),
            weights=sample_weight,
        )


class TreeGAMSimpleRegressor(TreeGAMSimple, RegressorMixin):
    ...


class TreeGAMSimpleClassifier(TreeGAMSimple, ClassifierMixin):
    ...


if __name__ == "__main__":
    X, y, feature_names = imodels.get_clean_dataset("heart")
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    gam = TreeGAMSimpleClassifier(
        boosting_strategy="cyclic",
        random_state=42,
        learning_rate=0.1,
        max_leaf_nodes=3,
        n_boosting_rounds=100,
    )
    gam.fit(X_train, y_train)

    # check roc auc score
    y_pred = gam.predict_proba(X_test)[:, 1]
    print(
        "train roc:",
        roc_auc_score(y_train, gam.predict_proba(X_train)[:, 1]).round(3),
    )
    print("test roc:", roc_auc_score(y_test, y_pred).round(3))
    print("test acc:", accuracy_score(y_test, gam.predict(X_test)).round(3))
    print('\t(imb:', np.mean(y_test).round(3), ')')

Classes

class TreeGAMSimple (n_boosting_rounds=100, max_leaf_nodes=3, learning_rate: float = 0.01, boosting_strategy='cyclic', validation_frac=0.15, random_state=None)

Tree-based generalized additive model (GAM). Uses cyclic boosting to fit a GAM with small trees, one feature at a time. Simplified version of the explainable boosting machine described in https://github.com/interpretml/interpret. Classification is supported for binary targets only. Fits a scalar bias to the mean of y.

Params

n_boosting_rounds : int
    Number of boosting rounds.
max_leaf_nodes : int
    Maximum number of leaf nodes for each tree.
learning_rate : float
    Learning rate for the boosting updates.
boosting_strategy : str ["cyclic", "greedy"]
    Whether to use cyclic boosting (cycle over features) or greedy boosting (select the best feature at each step).
validation_frac : float
    Fraction of the data held out for early stopping.
random_state : int
    Random seed.
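
A minimal usage sketch, mirroring the demo at the bottom of the module source (the "heart" dataset comes from that demo; variable names are illustrative):

import imodels
from sklearn.model_selection import train_test_split

X, y, feature_names = imodels.get_clean_dataset("heart")
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# cyclic strategy: one small tree per feature per boosting round
gam = TreeGAMSimpleClassifier(
    boosting_strategy="cyclic",
    learning_rate=0.1,
    n_boosting_rounds=100,
    random_state=0,
)
gam.fit(X_train, y_train)
probs = gam.predict_proba(X_test)[:, 1]  # positive-class scores in [0, 1]

With boosting_strategy="greedy", each round instead fits one candidate tree per feature and keeps only the tree with the lowest training MSE.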


Ancestors

  • sklearn.base.BaseEstimator
  • sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester

Subclasses

  • TreeGAMSimpleClassifier
  • TreeGAMSimpleRegressor

Methods

def fit(self, X, y, sample_weight=None)
Validates X and y (encoding class labels for classifiers), splits off a validation fraction for early stopping, initializes the scalar bias to the mean of y, then runs cyclic or greedy boosting. Returns self. See the module source above for the full implementation.
def predict(self, X)
Returns the positive-class column of predict_proba for regressors, and argmax class predictions for classifiers. See the module source above.
def predict_proba(self, X)
Returns an (n_samples, 2) array of scores; the second column is the bias plus the learning-rate-scaled sum of all tree predictions, clipped to [0, 1]. See the module source above.
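
Because every stored tree splits on a single feature, a feature's learned shape function can be read off the fitted model by summing the scaled predictions of that feature's trees over a grid. A minimal sketch (feature_shape is a hypothetical helper, not part of the class; trees that found no split act as constant offsets and are skipped here):

import numpy as np

def feature_shape(gam, feature_num, n_features, grid):
    # evaluate on inputs that are zero everywhere except the feature of
    # interest, matching how the trees were trained
    X_grid = np.zeros((len(grid), n_features))
    X_grid[:, feature_num] = grid
    shape = np.zeros(len(grid))
    for est in gam.estimators_:
        if est.tree_.feature[0] == feature_num:  # root split is on this feature
            shape += gam.learning_rate * est.predict(X_grid)
    return shape

grid = np.linspace(X_train[:, 0].min(), X_train[:, 0].max(), 100)
contribution = feature_shape(gam, 0, X_train.shape[1], grid)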
def set_fit_request(self: TreeGAMSimple, *, sample_weight: Union[bool, None, str] = '$UNCHANGED$') -> TreeGAMSimple

Request metadata passed to the fit method.

Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). See the scikit-learn User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

  • True: metadata is requested, and passed to fit if provided. The request is ignored if metadata is not provided.

  • False: metadata is not requested and the meta-estimator will not pass it to fit.

  • None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

  • str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in version: 1.3

Note

This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. used inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.

Parameters

sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
Metadata routing for sample_weight parameter in fit.

Returns

self : object
The updated object.
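A minimal routing sketch, assuming scikit-learn >= 1.4 with metadata routing enabled (w is a hypothetical per-sample weight array aligned with X_train):

import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

sklearn.set_config(enable_metadata_routing=True)

pipe = make_pipeline(
    StandardScaler().set_fit_request(sample_weight=False),  # do not route weights here
    TreeGAMSimpleClassifier().set_fit_request(sample_weight=True),
)
pipe.fit(X_train, y_train, sample_weight=w)  # routed only to the GAM's fit

Setting the scaler's request to False matters here: StandardScaler.fit also accepts sample_weight, and with routing enabled scikit-learn raises an error for consumers whose request is left unset.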
class TreeGAMSimpleClassifier (n_boosting_rounds=100, max_leaf_nodes=3, learning_rate: float = 0.01, boosting_strategy='cyclic', validation_frac=0.15, random_state=None)

Binary classification variant of TreeGAMSimple; predict returns class labels via argmax over predict_proba. See TreeGAMSimple above for the algorithm description and parameters.

Expand source code
class TreeGAMSimpleClassifier(TreeGAMSimple, ClassifierMixin):
    ...

Ancestors

  • TreeGAMSimple
  • sklearn.base.BaseEstimator
  • sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester
  • sklearn.base.ClassifierMixin

Methods

def set_score_request(self: TreeGAMSimpleClassifier, *, sample_weight: Union[bool, None, str] = '$UNCHANGED$') -> TreeGAMSimpleClassifier

Request metadata passed to the score method. The exact analogue of set_fit_request above: the same True / False / None / alias options apply, here controlling whether sample_weight is routed to score. Added in scikit-learn 1.3; only relevant when metadata routing is enabled and the estimator is used inside a meta-estimator. Returns self.
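A minimal sketch combining fit and score routing, assuming scikit-learn >= 1.4 (w is a hypothetical weight array; cross_validate's params argument routes it to both methods per the requests set below):

import sklearn
from sklearn.model_selection import cross_validate

sklearn.set_config(enable_metadata_routing=True)

clf = (
    TreeGAMSimpleClassifier()
    .set_fit_request(sample_weight=True)
    .set_score_request(sample_weight=True)
)
results = cross_validate(clf, X, y, params={"sample_weight": w})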

Inherited members

  • TreeGAMSimple: fit, predict, predict_proba

class TreeGAMSimpleRegressor (n_boosting_rounds=100, max_leaf_nodes=3, learning_rate: float = 0.01, boosting_strategy='cyclic', validation_frac=0.15, random_state=None)

Regression variant of TreeGAMSimple; predict returns the boosted scores directly, which are clipped to [0, 1]. See TreeGAMSimple above for the algorithm description and parameters.

Expand source code
class TreeGAMSimpleRegressor(TreeGAMSimple, RegressorMixin):
    ...
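
A minimal regression sketch on synthetic data. Because predictions are clipped to [0, 1] (see predict_proba above), this example scales the target into that range first; the data-generating function is purely illustrative:

import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.uniform(size=(500, 3))
# additive target: one shape function per feature
y = 0.5 * X[:, 0] + 0.3 * np.sin(3 * X[:, 1]) + 0.2 * X[:, 2]
y = (y - y.min()) / (y.max() - y.min())  # scale into [0, 1] to match the clipping

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = TreeGAMSimpleRegressor(learning_rate=0.1, random_state=0)
reg.fit(X_train, y_train)
preds = reg.predict(X_test)  # values in [0, 1]
print("held-out validation MSE:", reg.mse_val_)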

Ancestors

  • TreeGAMSimple
  • sklearn.base.BaseEstimator
  • sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester
  • sklearn.base.RegressorMixin

Methods

def set_score_request(self: TreeGAMSimpleRegressor, *, sample_weight: Union[bool, None, str] = '$UNCHANGED$') -> TreeGAMSimpleRegressor

Request metadata passed to the score method; identical to TreeGAMSimpleClassifier.set_score_request above.

Inherited members

  • TreeGAMSimple: fit, predict, predict_proba