from copy import deepcopy

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import check_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import _check_sample_weight, check_is_fitted, check_X_y

import imodels


class TreeGAMSimple(BaseEstimator):
"""Tree-based GAM classifier.
Uses cyclical boosting to fit a GAM with small trees.
Simplified version of the explainable boosting machine described in https://github.com/interpretml/interpret
Only works for binary classification.
Fits a scalar bias to the mean.
"""
def __init__(
self,
n_boosting_rounds=100,
max_leaf_nodes=3,
learning_rate: float = 0.01,
boosting_strategy="cyclic",
validation_frac=0.15,
random_state=None,
):
"""
Params
------
n_boosting_rounds : int
Number of boosting rounds for the cyclic boosting.
max_leaf_nodes : int
Maximum number of leaf nodes for the trees in the cyclic boosting.
learning_rate: float
Learning rate for the cyclic boosting.
boosting_strategy : str ["cyclic", "greedy"]
            Whether to use cyclic boosting (cycle over the features in order) or greedy boosting (select the best feature at each step).
validation_frac: float
Fraction of data to use for early stopping.
random_state : int
Random seed.
"""
self.n_boosting_rounds = n_boosting_rounds
self.max_leaf_nodes = max_leaf_nodes
self.learning_rate = learning_rate
self.boosting_strategy = boosting_strategy
self.validation_frac = validation_frac
self.random_state = random_state
def fit(self, X, y, sample_weight=None):
X, y = check_X_y(X, y, accept_sparse=False, multi_output=False)
if isinstance(self, ClassifierMixin):
check_classification_targets(y)
self.classes_, y = np.unique(y, return_inverse=True)
sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
# split into train and validation for early stopping
(
X_train,
X_val,
y_train,
y_val,
sample_weight_train,
sample_weight_val,
) = train_test_split(
X,
y,
sample_weight,
test_size=self.validation_frac,
random_state=self.random_state,
stratify=y if isinstance(self, ClassifierMixin) else None,
)
self.estimators_ = []
self.bias_ = np.mean(y)
self._cyclic_boost(
X_train,
y_train,
sample_weight_train,
X_val,
y_val,
sample_weight_val,
)
self.mse_val_ = self._calc_mse(X_val, y_val, sample_weight_val)
return self
def _cyclic_boost(
self, X_train, y_train, sample_weight_train, X_val, y_val, sample_weight_val
):
"""Apply cyclic boosting, storing trees in self.estimators_"""
residuals_train = y_train - self.predict_proba(X_train)[:, 1]
mse_val = self._calc_mse(X_val, y_val, sample_weight_val)
for _ in range(self.n_boosting_rounds):
boosting_round_ests = []
boosting_round_mses = []
for feature_num in range(X_train.shape[1]):
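                # zero out every other column so this tree can only split on feature_num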
X_ = np.zeros_like(X_train)
X_[:, feature_num] = X_train[:, feature_num]
est = DecisionTreeRegressor(
max_leaf_nodes=self.max_leaf_nodes,
random_state=self.random_state,
)
est.fit(X_, residuals_train, sample_weight=sample_weight_train)
                # the root node's feature is -2 if the tree is a single leaf;
                # otherwise it must have split on the requested feature
                successfully_split_on_feature = est.tree_.feature[0] in (
                    feature_num,
                    -2,
                )
                if not successfully_split_on_feature:
                    continue
self.estimators_.append(est)
residuals_train_new = (
residuals_train - self.learning_rate * est.predict(X_train)
)
if self.boosting_strategy == "cyclic":
residuals_train = residuals_train_new
elif self.boosting_strategy == "greedy":
mse_train_new = self._calc_mse(
X_train, y_train, sample_weight_train
)
                    # for greedy boosting, pop the candidate tree back off;
                    # only the round's best tree (lowest train MSE) is kept below
                    boosting_round_ests.append(
                        deepcopy(self.estimators_.pop()))
                    boosting_round_mses.append(mse_train_new)
if self.boosting_strategy == "greedy":
best_est = boosting_round_ests[np.argmin(boosting_round_mses)]
self.estimators_.append(best_est)
residuals_train = (
residuals_train - self.learning_rate *
best_est.predict(X_train)
)
# early stopping if validation error does not decrease
mse_val_new = self._calc_mse(X_val, y_val, sample_weight_val)
if mse_val_new >= mse_val:
return
else:
mse_val = mse_val_new
def predict_proba(self, X):
X = check_array(X, accept_sparse=False, dtype=None)
check_is_fitted(self)
probs1 = np.ones(X.shape[0]) * self.bias_
        for est in self.estimators_:
            probs1 += est.predict(X) * self.learning_rate
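        # raw additive scores are not guaranteed to lie in [0, 1], so clip;
        # note this also bounds TreeGAMSimpleRegressor predictions to [0, 1]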
probs1 = np.clip(probs1, a_min=0, a_max=1)
return np.array([1 - probs1, probs1]).T
def predict(self, X):
if isinstance(self, RegressorMixin):
return self.predict_proba(X)[:, 1]
elif isinstance(self, ClassifierMixin):
return np.argmax(self.predict_proba(X), axis=1)
def _calc_mse(self, X, y, sample_weight=None):
return np.average(
np.square(y - self.predict_proba(X)[:, 1]),
weights=sample_weight,
)
class TreeGAMSimpleRegressor(TreeGAMSimple, RegressorMixin):
...
class TreeGAMSimpleClassifier(TreeGAMSimple, ClassifierMixin):
...
if __name__ == "__main__":
X, y, feature_names = imodels.get_clean_dataset("heart")
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
gam = TreeGAMSimpleClassifier(
boosting_strategy="cyclic",
random_state=42,
learning_rate=0.1,
max_leaf_nodes=3,
n_boosting_rounds=100,
)
    gam.fit(X_train, y_train)
# check roc auc score
    y_pred = gam.predict_proba(X_test)[:, 1]
    print(
        "train roc:",
        round(roc_auc_score(y_train, gam.predict_proba(X_train)[:, 1]), 3),
    )
    print("test roc:", round(roc_auc_score(y_test, y_pred), 3))
    print("test acc:", round(accuracy_score(y_test, gam.predict(X_test)), 3))
    print("\t(imb:", round(float(np.mean(y_test)), 3), ")")
Classes
class TreeGAMSimple (n_boosting_rounds=100, max_leaf_nodes=3, learning_rate: float = 0.01, boosting_strategy='cyclic', validation_frac=0.15, random_state=None)
Tree-based GAM. Uses cyclic boosting to fit a GAM with small trees. Simplified version of the Explainable Boosting Machine described at https://github.com/interpretml/interpret. Classification supports only binary targets. Fits a scalar bias to the mean of the targets.
Params
- n_boosting_rounds (int): number of boosting rounds.
- max_leaf_nodes (int): maximum number of leaf nodes for each tree.
- learning_rate (float): learning rate for the boosting.
- boosting_strategy (str, "cyclic" or "greedy"): cycle over the features in order, or select the best feature at each step.
- validation_frac (float): fraction of the data held out for early stopping.
- random_state (int): random seed.
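The boosting_strategy parameter is the main modeling choice. A minimal sketch comparing the two strategies via the fitted mse_val_ attribute (the held-out validation MSE stored at the end of fit); it reuses the dataset from the module's demo, but the split and hyperparameters here are illustrative:

# Sketch: compare cyclic vs. greedy boosting via the fitted mse_val_ attribute.
import imodels
from sklearn.model_selection import train_test_split

X, y, _ = imodels.get_clean_dataset("heart")
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

for strategy in ["cyclic", "greedy"]:
    gam = TreeGAMSimpleClassifier(
        boosting_strategy=strategy, learning_rate=0.1, random_state=0
    )
    gam.fit(X_train, y_train)
    print(strategy, "validation MSE:", round(float(gam.mse_val_), 4))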
Ancestors
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
Subclasses

- TreeGAMSimpleRegressor
- TreeGAMSimpleClassifier
Methods
def fit(self, X, y, sample_weight=None)
Fit the model: validate the inputs, hold out a validation split for early stopping, fit the scalar bias to the mean, and run the boosting loop.
def predict(self, X)
Predict targets: the continuous GAM output for the regressor, or the argmax class for the classifier.
def predict_proba(self, X)
Predict class probabilities: sum the bias and the learning-rate-scaled tree outputs, clip into [0, 1], and stack the probabilities for both classes.
def set_fit_request(self: TreeGAMSimple, *, sample_weight: Union[bool, None, str] = '$UNCHANGED$') -> TreeGAMSimple
Request metadata passed to the fit method.

Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). Please see the scikit-learn User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

- True: metadata is requested, and passed to fit if provided. The request is ignored if metadata is not provided.
- False: metadata is not requested and the meta-estimator will not pass it to fit.
- None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.
- str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in version 1.3.

Note: this method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.

Parameters

- sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED. Metadata routing for the sample_weight parameter in fit.

Returns

- self : object. The updated object.
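A minimal sketch of using this request, assuming scikit-learn >= 1.4 with metadata routing enabled; the pipeline, data, and weights below are illustrative, not from the source:

# Sketch: routing sample_weight to fit inside a Pipeline
# (assumes scikit-learn >= 1.4; data and weights are made up).
import numpy as np
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

sklearn.set_config(enable_metadata_routing=True)

rng = np.random.RandomState(0)
X = rng.rand(200, 4)
y = (X[:, 0] > 0.5).astype(int)
weights = np.linspace(0.5, 1.5, len(y))

# each consumer declares what it wants: the GAM requests sample_weight,
# the scaler explicitly declines it
gam = TreeGAMSimpleClassifier(random_state=0).set_fit_request(sample_weight=True)
scaler = StandardScaler().set_fit_request(sample_weight=False)
pipe = make_pipeline(scaler, gam)
pipe.fit(X, y, sample_weight=weights)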
class TreeGAMSimpleClassifier (n_boosting_rounds=100, max_leaf_nodes=3, learning_rate: float = 0.01, boosting_strategy='cyclic', validation_frac=0.15, random_state=None)
Tree-based GAM classifier. Uses cyclic boosting to fit a GAM with small trees. Simplified version of the Explainable Boosting Machine described at https://github.com/interpretml/interpret. Supports only binary targets. Fits a scalar bias to the mean of the targets.

Params

Same parameters as TreeGAMSimple above.
Ancestors
- TreeGAMSimple
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
- sklearn.base.ClassifierMixin
Methods
def set_score_request(self: TreeGAMSimpleClassifier, *, sample_weight: Union[bool, None, str] = '$UNCHANGED$') -> TreeGAMSimpleClassifier
Request metadata passed to the score method.

Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). Please see the scikit-learn User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

- True: metadata is requested, and passed to score if provided. The request is ignored if metadata is not provided.
- False: metadata is not requested and the meta-estimator will not pass it to score.
- None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.
- str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in version 1.3.

Note: this method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.

Parameters

- sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED. Metadata routing for the sample_weight parameter in score.

Returns

- self : object. The updated object.
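A hedged sketch of routing sample_weight to score during cross-validation, assuming scikit-learn >= 1.4 (for the params argument of cross_validate) and that the default scorer honors the score request; the data and weights are illustrative:

# Sketch: route sample_weight to both fit and score on each CV fold
# (assumes scikit-learn >= 1.4; data and weights are made up).
import numpy as np
import sklearn
from sklearn.model_selection import cross_validate

sklearn.set_config(enable_metadata_routing=True)

rng = np.random.RandomState(0)
X = rng.rand(200, 4)
y = (X[:, 0] > 0.5).astype(int)
weights = np.linspace(0.5, 1.5, len(y))

gam = (
    TreeGAMSimpleClassifier(random_state=0)
    .set_fit_request(sample_weight=True)
    .set_score_request(sample_weight=True)
)
# params is routed to gam.fit and, via the default scorer, to gam.score
results = cross_validate(gam, X, y, params={"sample_weight": weights})
print(results["test_score"])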
Inherited members
class TreeGAMSimpleRegressor (n_boosting_rounds=100, max_leaf_nodes=3, learning_rate: float = 0.01, boosting_strategy='cyclic', validation_frac=0.15, random_state=None)
Tree-based GAM regressor. Uses cyclic boosting to fit a GAM with small trees. Simplified version of the Explainable Boosting Machine described at https://github.com/interpretml/interpret. Fits a scalar bias to the mean of the targets.

Params

Same parameters as TreeGAMSimple above.
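Because the shared predict_proba clips the additive output into [0, 1], the regressor's predictions are bounded to that range. A minimal sketch on a synthetic target scaled into [0, 1]; the data is illustrative, not from the source:

# Sketch: TreeGAMSimpleRegressor on a synthetic target inside [0, 1],
# since the shared predict_proba clips outputs to that range.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(300, 3)
y = 0.5 * X[:, 0] + 0.3 * np.sin(3 * X[:, 1])  # target stays within [0, 1]

reg = TreeGAMSimpleRegressor(learning_rate=0.1, random_state=0)
reg.fit(X, y)
preds = reg.predict(X)  # continuous values, clipped into [0, 1]
print("train MSE:", round(float(np.mean((y - preds) ** 2)), 4))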
Ancestors
- TreeGAMSimple
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
- sklearn.base.RegressorMixin
Methods
def set_score_request(self: TreeGAMSimpleRegressor, *, sample_weight: Union[bool, None, str] = '$UNCHANGED$') -> TreeGAMSimpleRegressor
Request metadata passed to the score method.

Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). Please see the scikit-learn User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

- True: metadata is requested, and passed to score if provided. The request is ignored if metadata is not provided.
- False: metadata is not requested and the meta-estimator will not pass it to score.
- None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.
- str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in version 1.3.

Note: this method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.

Parameters

- sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED. Metadata routing for the sample_weight parameter in score.

Returns

- self : object. The updated object.
Inherited members