"""
The classes below (BasicDiscretizer, ExtraBasicDiscretizer, and
RFDiscretizer) provide additional functionality and wrappers around
KBinsDiscretizer from sklearn. In particular, the AbstractDiscretizer
subclasses
- take a data frame as input and output a data frame
- allow for discretization of a subset of columns in the data
  frame and return the full data frame with both the
  discretized and non-discretized columns
- allow quantile bins to be a single point if necessary
"""
import numbers

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
from sklearn.utils.validation import check_is_fitted, check_array
class AbstractDiscretizer(TransformerMixin, BaseEstimator):
"""
Discretize numeric data into bins. Base class.
Params
------
n_bins : int or array-like of shape (len(dcols),), default=2
Number of bins to discretize each feature into.
dcols : list of strings
The names of the columns to be discretized; by default,
discretize all float and int columns in X.
    encode : {'onehot', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.
        onehot
            Encode the transformed result with one-hot encoding and
            return a dense array.
        ordinal
            Return the bin identifier encoded as an integer value.
    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.
        uniform
            All bins in each feature have identical widths.
        quantile
            All bins in each feature have the same number of points.
        kmeans
            Values in each bin have the same nearest center of a 1D
            k-means cluster.
    onehot_drop : {'first', 'if_binary'} or an array-like of shape (len(dcols),), default='if_binary'
        Specifies a methodology to use to drop one of the categories
        per feature when encode = "onehot".
        None
            Retain all features (the default).
        'first'
            Drop the first category in each feature. If only one
            category is present, the feature will be dropped entirely.
        'if_binary'
            Drop the first category in each feature with two categories.
            Features with 1 or more than 2 categories are left intact.
"""
def __init__(self, n_bins=2, dcols=[],
encode='onehot', strategy='quantile',
onehot_drop='if_binary'):
self.n_bins = n_bins
self.encode = encode
self.strategy = strategy
self.dcols = dcols
        # store unconditionally so BaseEstimator.get_params() works
        # even when encode != 'onehot'
        self.onehot_drop = onehot_drop
def _validate_n_bins(self):
"""
Check if n_bins argument is valid.
"""
orig_bins = self.n_bins
n_features = len(self.dcols_)
if isinstance(orig_bins, numbers.Number):
if not isinstance(orig_bins, numbers.Integral):
raise ValueError(
"{} received an invalid n_bins type. "
"Received {}, expected int.".format(
AbstractDiscretizer.__name__, type(orig_bins).__name__
)
)
if orig_bins < 2:
raise ValueError(
"{} received an invalid number "
"of bins. Received {}, expected at least 2.".format(
AbstractDiscretizer.__name__, orig_bins
)
)
self.n_bins = np.full(n_features, orig_bins, dtype=int)
else:
n_bins = check_array(orig_bins, dtype=int,
copy=True, ensure_2d=False)
if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
raise ValueError(
"n_bins must be a scalar or array of shape (n_features,).")
bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)
violating_indices = np.where(bad_nbins_value)[0]
if violating_indices.shape[0] > 0:
indices = ", ".join(str(i) for i in violating_indices)
raise ValueError(
"{} received an invalid number "
"of bins at indices {}. Number of bins "
"must be at least 2, and must be an int.".format(
AbstractDiscretizer.__name__, indices
)
)
self.n_bins = n_bins
def _validate_dcols(self, X):
"""
Check if dcols argument is valid.
"""
for col in self.dcols_:
if col not in X.columns:
raise ValueError("{} is not a column in X.".format(col))
if not is_numeric_dtype(X[col].dtype):
raise ValueError("Cannot discretize non-numeric columns.")
def _validate_args(self):
"""
Check if encode, strategy arguments are valid.
"""
valid_encode = ('onehot', 'ordinal')
if self.encode not in valid_encode:
raise ValueError("Valid options for 'encode' are {}. Got encode={!r} instead."
.format(valid_encode, self.encode))
valid_strategy = ('uniform', 'quantile', 'kmeans')
if (self.strategy not in valid_strategy):
raise ValueError("Valid options for 'strategy' are {}. Got strategy={!r} instead."
.format(valid_strategy, self.strategy))
def _discretize_to_bins(self, x, bin_edges,
keep_pointwise_bins=False):
"""
Discretize data into bins of the form [a, b) given bin
edges/boundaries
Parameters
----------
x : array-like of shape (n_samples,)
Data vector to be discretized.
bin_edges : array-like
Values to serve as bin edges; should include min and
max values for the range of x
keep_pointwise_bins : boolean
If True, treat duplicate bin_edges as a pointwise bin,
i.e., [a, a]. If False, these bins are in effect ignored.
Returns
-------
xd: array of shape (n_samples,) where x has been
transformed to the binned space
"""
# ignore min and max values in bin generation
unique_edges = np.unique(bin_edges[1:-1])
if keep_pointwise_bins:
# note: min and max values are used to define pointwise bins
pointwise_bins = np.unique(
bin_edges[pd.Series(bin_edges).duplicated()])
else:
pointwise_bins = np.array([])
xd = np.zeros_like(x)
i = 1
for idx, split in enumerate(unique_edges):
if idx == (len(unique_edges) - 1): # uppermost bin
                if (idx == 0) and (split in pointwise_bins):
                    # two bins total: (-inf, a], (a, inf)
                    indicator = x > split
                else:
                    indicator = x >= split  # uppermost bin: [a, inf)
            else:
                if split in pointwise_bins:
                    # create two bins: [a, a], (a, b)
                    indicator = (x > split) & (x < unique_edges[idx + 1])
if idx != 0:
xd[x == split] = i
i += 1
else:
# create bin: [a, b)
indicator = (x >= split) & (x < unique_edges[idx + 1])
xd[indicator] = i
i += 1
return xd.astype(int)
def _fit_preprocessing(self, X):
"""
Initial checks before fitting the estimator.
Parameters
----------
X : data frame of shape (n_samples, n_features)
(Training) data to be discretized.
Returns
-------
self
"""
        # by default, discretize all numeric columns
        if len(self.dcols) == 0:
            numeric_cols = [
                col for col in X.columns if is_numeric_dtype(X[col].dtype)]
            self.dcols_ = numeric_cols
        else:
            self.dcols_ = list(self.dcols)
# error checking
self._validate_n_bins()
self._validate_args()
self._validate_dcols(X)
def _transform_postprocessing(self, discretized_df, X):
"""
Final processing in transform method. Does one-hot encoding
(if specified) and joins discretized columns to the
un-transformed columns in X.
Parameters
----------
discretized_df : data frame of shape (n_sample, len(dcols))
Discretized data in the transformed bin space.
X : data frame of shape (n_samples, n_features)
Data to be discretized.
Returns
-------
X_discretized : data frame
Data with features in dcols transformed to the
binned space. All other features remain unchanged.
Encoded either as ordinal or one-hot.
"""
discretized_df = discretized_df[self.dcols_]
# return onehot encoded X if specified
if self.encode == "onehot":
colnames = [str(col) for col in self.dcols_]
            try:
                onehot_col_names = self.onehot_.get_feature_names_out(colnames)
            except AttributeError:
                onehot_col_names = self.onehot_.get_feature_names(
                    colnames)  # older versions of sklearn
            discretized_df = self.onehot_.transform(discretized_df.astype(str))
            if hasattr(discretized_df, "toarray"):
                discretized_df = discretized_df.toarray()  # densify sparse output
            discretized_df = pd.DataFrame(discretized_df,
                                          columns=onehot_col_names,
                                          index=X.index).astype(int)
# join discretized columns with rest of X
cols = [col for col in X.columns if col not in self.dcols_]
X_discretized = pd.concat([discretized_df, X[cols]], axis=1)
return X_discretized
class ExtraBasicDiscretizer(TransformerMixin):
"""
Discretize provided columns into bins and return in one-hot format.
Generates meaningful column names based on bin edges.
Wraps KBinsDiscretizer from sklearn.
Params
------
dcols : list of strings
The names of the columns to be discretized.
n_bins : int or array-like of shape (len(dcols),), default=4
Number of bins to discretize each feature into.
strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
Strategy used to define the widths of the bins.
uniform
All bins in each feature have identical widths.
quantile
All bins in each feature have the same number of points.
kmeans
Values in each bin have the same nearest center of a 1D
k-means cluster.
    onehot_drop : {'first', 'if_binary'} or an array-like of shape (len(dcols),), default='if_binary'
        Specifies a methodology to use to drop one of the categories
        per feature when one-hot encoding.
        None
            Retain all features (the default).
        'first'
            Drop the first category in each feature. If only one
            category is present, the feature will be dropped entirely.
        'if_binary'
            Drop the first category in each feature with two categories.
            Features with 1 or more than 2 categories are left intact.
Attributes
----------
discretizer_ : object of class KBinsDiscretizer()
Primary discretization method used to bin numeric data
Examples
--------
"""
def __init__(self,
dcols,
n_bins=4,
strategy='quantile',
onehot_drop='if_binary'):
self.dcols = dcols
self.n_bins = n_bins
self.strategy = strategy
self.onehot_drop = onehot_drop
def fit(self, X, y=None):
"""
Fit the estimator.
Parameters
----------
X : data frame of shape (n_samples, n_features)
(Training) data to be discretized.
y : Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline` and fit_transform method
Returns
-------
self
"""
# Fit KBinsDiscretizer to the selected columns
discretizer = KBinsDiscretizer(
n_bins=self.n_bins, strategy=self.strategy, encode='ordinal')
discretizer.fit(X[self.dcols])
self.discretizer_ = discretizer
# Fit OneHotEncoder to the ordinal output of KBinsDiscretizer
disc_ordinal_np = discretizer.transform(X[self.dcols])
disc_ordinal_df = pd.DataFrame(disc_ordinal_np, columns=self.dcols)
disc_ordinal_df_str = disc_ordinal_df.astype(int).astype(str)
encoder = OneHotEncoder(drop=self.onehot_drop) # , sparse=False)
encoder.fit(disc_ordinal_df_str)
self.encoder_ = encoder
return self
def transform(self, X):
"""
Discretize the data.
Parameters
----------
X : data frame of shape (n_samples, n_features)
Data to be discretized.
Returns
-------
X_discretized : data frame
Data with features in dcols transformed to the
binned space. All other features remain unchanged.
"""
# Apply discretizer transform to get ordinally coded DF
disc_ordinal_np = self.discretizer_.transform(X[self.dcols])
disc_ordinal_df = pd.DataFrame(disc_ordinal_np, columns=self.dcols)
disc_ordinal_df_str = disc_ordinal_df.astype(int).astype(str)
        # One-hot encode the ordinal DF (densify in case the encoder
        # returns a sparse matrix, the OneHotEncoder default)
        disc_onehot_np = self.encoder_.transform(disc_ordinal_df_str)
        if hasattr(disc_onehot_np, "toarray"):
            disc_onehot_np = disc_onehot_np.toarray()
        disc_onehot = pd.DataFrame(
            disc_onehot_np, columns=self.encoder_.get_feature_names_out(),
            index=X.index)
# Name columns after the interval they represent (e.g. 0.1_to_0.5)
for col, bin_edges in zip(self.dcols, self.discretizer_.bin_edges_):
bin_edges = bin_edges.astype(str)
for ordinal_value in disc_ordinal_df_str[col].unique():
bin_lb = bin_edges[int(ordinal_value)]
bin_ub = bin_edges[int(ordinal_value) + 1]
interval_string = f'{bin_lb}_to_{bin_ub}'
disc_onehot = disc_onehot.rename(
columns={f'{col}_{ordinal_value}': f'{col}_' + interval_string})
# Join discretized columns with rest of X
non_dcols = [col for col in X.columns if col not in self.dcols]
X_discretized = pd.concat([disc_onehot, X[non_dcols]], axis=1)
return X_discretized
class BasicDiscretizer(AbstractDiscretizer):
"""
Discretize numeric data into bins. Provides a wrapper around
    KBinsDiscretizer from sklearn.
Params
------
n_bins : int or array-like of shape (len(dcols),), default=2
Number of bins to discretize each feature into.
dcols : list of strings
The names of the columns to be discretized; by default,
discretize all float and int columns in X.
encode : {'onehot', 'ordinal'}, default='onehot'
Method used to encode the transformed result.
onehot
Encode the transformed result with one-hot encoding and
return a dense array.
ordinal
Return the bin identifier encoded as an integer value.
strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
Strategy used to define the widths of the bins.
uniform
All bins in each feature have identical widths.
quantile
All bins in each feature have the same number of points.
kmeans
Values in each bin have the same nearest center of a 1D
k-means cluster.
    onehot_drop : {'first', 'if_binary'} or an array-like of shape (len(dcols),), default='if_binary'
        Specifies a methodology to use to drop one of the categories
        per feature when encode = "onehot".
        None
            Retain all features (the default).
        'first'
            Drop the first category in each feature. If only one
            category is present, the feature will be dropped entirely.
        'if_binary'
            Drop the first category in each feature with two categories.
            Features with 1 or more than 2 categories are left intact.
Attributes
----------
discretizer_ : object of class KBinsDiscretizer()
Primary discretization method used to bin numeric data
    manual_discretizer_ : dictionary
        Provides bin_edges to feed into _discretize_to_bins()
        and do quantile discretization manually for features where
        KBinsDiscretizer() failed. Ignored if strategy != 'quantile'
        or no errors in KBinsDiscretizer().
onehot_ : object of class OneHotEncoder()
One hot encoding fit. Ignored if encode != 'onehot'
Examples
--------
"""
def __init__(self, n_bins=2, dcols=[],
encode='onehot', strategy='quantile',
onehot_drop='if_binary'):
super().__init__(n_bins=n_bins, dcols=dcols,
encode=encode, strategy=strategy,
onehot_drop=onehot_drop)
def fit(self, X, y=None):
"""
Fit the estimator.
Parameters
----------
X : data frame of shape (n_samples, n_features)
(Training) data to be discretized.
y : Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline` and fit_transform method
Returns
-------
self
"""
# initialization and error checking
self._fit_preprocessing(X)
# apply KBinsDiscretizer to the selected columns
discretizer = KBinsDiscretizer(n_bins=self.n_bins,
encode='ordinal',
strategy=self.strategy)
discretizer.fit(X[self.dcols_])
self.discretizer_ = discretizer
if (self.encode == 'onehot') | (self.strategy == 'quantile'):
discretized_df = discretizer.transform(X[self.dcols_])
discretized_df = pd.DataFrame(discretized_df,
columns=self.dcols_,
index=X.index).astype(int)
# fix KBinsDiscretizer errors if any when strategy = "quantile"
if self.strategy == "quantile":
err_idx = np.where(discretized_df.nunique() != self.n_bins)[0]
self.manual_discretizer_ = dict()
for idx in err_idx:
col = self.dcols_[idx]
if X[col].nunique() > 1:
q_values = np.linspace(0, 1, self.n_bins[idx] + 1)
bin_edges = np.quantile(X[col], q_values)
discretized_df[col] = self._discretize_to_bins(X[col], bin_edges,
keep_pointwise_bins=True)
self.manual_discretizer_[col] = bin_edges
# fit onehot encoded X if specified
if self.encode == "onehot":
onehot = OneHotEncoder(drop=self.onehot_drop) # , sparse=False)
onehot.fit(discretized_df.astype(str))
self.onehot_ = onehot
return self
def transform(self, X):
"""
Discretize the data.
Parameters
----------
X : data frame of shape (n_samples, n_features)
Data to be discretized.
Returns
-------
X_discretized : data frame
Data with features in dcols transformed to the
binned space. All other features remain unchanged.
"""
check_is_fitted(self)
# transform using KBinsDiscretizer
discretized_df = self.discretizer_.transform(
X[self.dcols_]).astype(int)
discretized_df = pd.DataFrame(discretized_df,
columns=self.dcols_,
index=X.index)
# fix KBinsDiscretizer errors (if any) when strategy = "quantile"
if self.strategy == "quantile":
for col in self.manual_discretizer_.keys():
bin_edges = self.manual_discretizer_[col]
discretized_df[col] = self._discretize_to_bins(X[col], bin_edges,
keep_pointwise_bins=True)
# return onehot encoded data if specified and
# join discretized columns with rest of X
X_discretized = self._transform_postprocessing(discretized_df, X)
return X_discretized
class RFDiscretizer(AbstractDiscretizer):
"""
Discretize numeric data into bins using RF splits.
Parameters
----------
    rf_model : RandomForestClassifier() or RandomForestRegressor()
        RF model from which to extract splits for discretization.
        Default is RandomForestClassifier(n_estimators=500) or
        RandomForestRegressor(n_estimators=500)
    classification : boolean, default=False
        Used only if rf_model=None. If True,
        rf_model=RandomForestClassifier(n_estimators=500).
        Else, rf_model=RandomForestRegressor(n_estimators=500)
n_bins : int or array-like of shape (len(dcols),), default=2
Number of bins to discretize each feature into.
dcols : list of strings
The names of the columns to be discretized; by default,
discretize all float and int columns in X.
    encode : {'onehot', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.
        onehot - Encode the transformed result with one-hot encoding and
        return a dense array.
        ordinal - Return the bin identifier encoded as an integer value.
    strategy : {'uniform', 'quantile'}, default='quantile'
        Strategy used to choose RF split points.
        uniform - RF split points chosen to be uniformly spaced out.
        quantile - RF split points chosen based on equally-spaced quantiles.
    backup_strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins if no RF splits exist
        for that feature. Used in KBinsDiscretizer.
        uniform
            All bins in each feature have identical widths.
        quantile
            All bins in each feature have the same number of points.
        kmeans
            Values in each bin have the same nearest center of a 1D
            k-means cluster.
    onehot_drop : {'first', 'if_binary'} or array-like of shape (len(dcols),), default='if_binary'
        Specifies a methodology to use to drop one of the categories
        per feature when encode = "onehot".
        None
            Retain all features (the default).
        'first'
            Drop the first category in each feature. If only one
            category is present, the feature will be dropped entirely.
        'if_binary'
            Drop the first category in each feature with two categories.
            Features with 1 or more than 2 categories are left intact.
Attributes
----------
rf_splits : dictionary where
key = feature name
value = array of all RF split threshold values
bin_edges_ : dictionary where
key = feature name
value = array of bin edges used for discretization, taken from
RF split values
missing_rf_cols_ : array-like
List of features that were not used in RF
backup_discretizer_ : object of class BasicDiscretizer()
Discretization method used to bin numeric data for features
in missing_rf_cols_
onehot_ : object of class OneHotEncoder()
One hot encoding fit. Ignored if encode != 'onehot'
"""
def __init__(self, rf_model=None, classification=False,
n_bins=2, dcols=[], encode='onehot',
strategy='quantile', backup_strategy='quantile',
onehot_drop='if_binary'):
super().__init__(n_bins=n_bins, dcols=dcols,
encode=encode, strategy=strategy,
onehot_drop=onehot_drop)
self.backup_strategy = backup_strategy
self.rf_model = rf_model
if rf_model is None:
self.classification = classification
    def _validate_args(self):
        """
        Check if encode, strategy, backup_strategy arguments are valid.
        """
        super()._validate_args()
        # RF-based binning only supports 'uniform' and 'quantile' strategies
        valid_strategy = ('uniform', 'quantile')
        if self.strategy not in valid_strategy:
            raise ValueError("Valid options for 'strategy' are {}. Got strategy={!r} instead."
                             .format(valid_strategy, self.strategy))
        valid_backup_strategy = ('uniform', 'quantile', 'kmeans')
        if self.backup_strategy not in valid_backup_strategy:
            raise ValueError("Valid options for 'backup_strategy' are {}. Got backup_strategy={!r} instead."
                             .format(valid_backup_strategy, self.backup_strategy))
def _get_rf_splits(self, col_names):
"""
Get all splits in random forest ensemble
Parameters
----------
col_names : array-like of shape (n_features,)
Column names for X used to train rf_model
Returns
-------
rule_dict : dictionary where
key = feature name
value = array of all RF split threshold values
"""
rule_dict = {}
for model in self.rf_model.estimators_:
tree = model.tree_
tree_it = enumerate(zip(tree.children_left,
tree.children_right,
tree.feature,
tree.threshold))
for node_idx, data in tree_it:
left, right, feature, th = data
if (left != -1) | (right != -1):
feature = col_names[feature]
if feature in rule_dict:
rule_dict[feature].append(th)
else:
rule_dict[feature] = [th]
return rule_dict
def _fit_rf(self, X, y=None):
"""
Fit random forest (if necessary) and obtain RF split thresholds
Parameters
----------
X : data frame of shape (n_samples, n_features)
Training data used to fit RF
y : array-like of shape (n_samples,)
Training response vector used to fit RF
Returns
-------
rf_splits : dictionary where
key = feature name
value = array of all RF split threshold values
"""
# If no rf_model given, train default random forest model
if self.rf_model is None:
if y is None:
raise ValueError("Must provide y if rf_model is not given.")
if self.classification:
self.rf_model = RandomForestClassifier(n_estimators=500)
else:
self.rf_model = RandomForestRegressor(n_estimators=500)
self.rf_model.fit(X, y)
        else:
            # fit the provided rf model if it has not yet been trained;
            # check_is_fitted raises rather than returning a boolean
            try:
                check_is_fitted(self.rf_model)
            except NotFittedError:
                if y is None:
                    raise ValueError(
                        "Must provide y if rf_model has not been trained.")
                self.rf_model.fit(X, y)
# get all random forest split points
self.rf_splits = self._get_rf_splits(list(X.columns))
def reweight_n_bins(self, X, y=None, by="nsplits"):
"""
Reallocate number of bins per feature.
Parameters
----------
X : data frame of shape (n_samples, n_features)
(Training) data to be discretized.
y : array-like of shape (n_samples,)
(Training) response vector. Required only if
rf_model = None or rf_model has not yet been fitted
        by : {'nsplits'}, default='nsplits'
            Specifies how to reallocate the number of bins per feature.
            nsplits
                Reallocate the number of bins so that each feature
                in dcols gets a minimum of 2 bins, with the
                remaining bins distributed proportionally to the
                number of RF splits using that feature
        Returns
        -------
        self.n_bins : array of shape (len(dcols),)
            Number of bins per feature, reallocated according to
            the 'by' argument
"""
# initialization and error checking
self._fit_preprocessing(X)
# get all random forest split points
self._fit_rf(X=X, y=y)
# get total number of bins to reallocate
total_bins = self.n_bins.sum()
# reweight n_bins
if by == "nsplits":
# each col gets at least 2 bins; remaining bins get
# reallocated based on number of RF splits using that feature
n_rules = np.array([len(self.rf_splits[col])
for col in self.dcols_])
            self.n_bins = (np.round(n_rules / n_rules.sum() *
                                    (total_bins - 2 * len(self.dcols_))) + 2).astype(int)
        else:
            valid_by = ('nsplits',)
            raise ValueError("Valid options for 'by' are {}. Got by={!r} instead."
                             .format(valid_by, by))
        return self.n_bins
def fit(self, X, y=None):
"""
Fit the estimator.
Parameters
----------
X : data frame of shape (n_samples, n_features)
(Training) data to be discretized.
y : array-like of shape (n_samples,)
(Training) response vector. Required only if
rf_model = None or rf_model has not yet been fitted
Returns
-------
self
"""
# initialization and error checking
self._fit_preprocessing(X)
# get all random forest split points
self._fit_rf(X=X, y=y)
# features that were not used in the rf but need to be discretized
self.missing_rf_cols_ = list(set(self.dcols_) -
set(self.rf_splits.keys()))
if len(self.missing_rf_cols_) > 0:
print("{} did not appear in random forest so were discretized via {} discretization"
.format(self.missing_rf_cols_, self.strategy))
missing_n_bins = np.array([self.n_bins[np.array(self.dcols_) == col][0]
for col in self.missing_rf_cols_])
backup_discretizer = BasicDiscretizer(n_bins=missing_n_bins,
dcols=self.missing_rf_cols_,
encode='ordinal',
strategy=self.backup_strategy)
backup_discretizer.fit(X[self.missing_rf_cols_])
self.backup_discretizer_ = backup_discretizer
else:
self.backup_discretizer_ = None
if self.encode == 'onehot':
if len(self.missing_rf_cols_) > 0:
discretized_df = backup_discretizer.transform(
X[self.missing_rf_cols_])
else:
discretized_df = pd.DataFrame({}, index=X.index)
# do discretization based on rf split thresholds
self.bin_edges_ = dict()
        for col in self.dcols_:
            if col in self.rf_splits.keys():
                # number of bins allotted to this feature (as a scalar)
                b = int(self.n_bins[np.array(self.dcols_) == col][0])
                if self.strategy == "quantile":
                    q_values = np.linspace(0, 1, b + 1)
                    bin_edges = np.quantile(self.rf_splits[col], q_values)
                elif self.strategy == "uniform":
                    width = (max(self.rf_splits[col]) -
                             min(self.rf_splits[col])) / b
                    bin_edges = width * \
                        np.arange(0, b + 1) + min(self.rf_splits[col])
self.bin_edges_[col] = bin_edges
if self.encode == 'onehot':
discretized_df[col] = self._discretize_to_bins(
X[col], bin_edges)
# fit onehot encoded X if specified
if self.encode == "onehot":
onehot = OneHotEncoder(drop=self.onehot_drop) # , sparse=False)
onehot.fit(discretized_df[self.dcols_].astype(str))
self.onehot_ = onehot
return self
def transform(self, X):
"""
Discretize the data.
Parameters
----------
X : data frame of shape (n_samples, n_features)
Data to be discretized.
Returns
-------
X_discretized : data frame
Data with features in dcols transformed to the
binned space. All other features remain unchanged.
"""
check_is_fitted(self)
# transform features that did not appear in RF
if len(self.missing_rf_cols_) > 0:
discretized_df = self.backup_discretizer_.transform(
X[self.missing_rf_cols_])
discretized_df = pd.DataFrame(discretized_df,
columns=self.missing_rf_cols_,
index=X.index)
else:
discretized_df = pd.DataFrame({}, index=X.index)
# do discretization based on rf split thresholds
for col in self.bin_edges_.keys():
discretized_df[col] = self._discretize_to_bins(
X[col], self.bin_edges_[col])
# return onehot encoded data if specified and
# join discretized columns with rest of X
X_discretized = self._transform_postprocessing(discretized_df, X)
return X_discretized
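A small self-contained demonstration of the pointwise-bin behavior described in the module docstring. The data and names below are illustrative, and the block is guarded so it only runs when the file is executed directly:

if __name__ == '__main__':
    # 'mostly_zero' is heavily tied at 0, so the median quantile edge
    # duplicates the minimum edge; KBinsDiscretizer would collapse the
    # column to a single bin, and the manual quantile fallback instead
    # keeps [0, 0] as its own pointwise bin
    X_demo = pd.DataFrame({'mostly_zero': [0] * 90 + list(range(1, 11)),
                           'label': ['a'] * 100})
    demo_disc = BasicDiscretizer(n_bins=2, dcols=['mostly_zero'],
                                 encode='ordinal')
    print(demo_disc.fit_transform(X_demo).head())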
Classes
class AbstractDiscretizer (n_bins=2, dcols=[], encode='onehot', strategy='quantile', onehot_drop='if_binary')
-
Discretize numeric data into bins. Base class.
Params
n_bins : int or array-like of shape (len(dcols),), default=2
    Number of bins to discretize each feature into.
dcols : list of strings
    The names of the columns to be discretized; by default, discretize all float and int columns in X.
encode : {'onehot', 'ordinal'}, default='onehot'
    Method used to encode the transformed result: 'onehot' returns a dense one-hot encoding; 'ordinal' returns the bin identifier as an integer value.
strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
    Strategy used to define the widths of the bins: 'uniform' (identical widths), 'quantile' (same number of points per bin), or 'kmeans' (values in each bin share the same nearest 1D k-means center).
onehot_drop : {'first', 'if_binary'} or an array-like of shape (len(dcols),), default='if_binary'
    Methodology to drop one of the categories per feature when encode = "onehot": None retains all features; 'first' drops the first category in each feature (a feature with a single category is dropped entirely); 'if_binary' drops the first category only in features with exactly two categories.
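The distinguishing behavior of this base class is its handling of pointwise (zero-width) bins when quantile edges repeat. A minimal sketch of the protected _discretize_to_bins helper, called through a concrete subclass (the data below are illustrative):

import numpy as np

disc = BasicDiscretizer()  # any concrete subclass exposes the helper
x = np.array([0., 1., 1., 1., 2., 5.])
bin_edges = np.array([0., 1., 1., 5.])  # duplicated interior edge at 1
# with keep_pointwise_bins=True the duplicate edge yields the two bins
# (-inf, 1] and (1, inf): values at or below 1 map to bin 0, the rest to bin 1
xd = disc._discretize_to_bins(x, bin_edges, keep_pointwise_bins=True)
# xd -> array([0, 0, 0, 0, 1, 1])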
Ancestors
- sklearn.base.TransformerMixin
- sklearn.utils._set_output._SetOutputMixin
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
Subclasses
- BasicDiscretizer
- RFDiscretizer
class BasicDiscretizer (n_bins=2, dcols=[], encode='onehot', strategy='quantile', onehot_drop='if_binary')
-
Discretize numeric data into bins. Provides a wrapper around KBinsDiscretizer from sklearn.
Params
n_bins : int or array-like of shape (len(dcols),), default=2
    Number of bins to discretize each feature into.
dcols : list of strings
    The names of the columns to be discretized; by default, discretize all float and int columns in X.
encode : {'onehot', 'ordinal'}, default='onehot'
    Method used to encode the transformed result: 'onehot' returns a dense one-hot encoding; 'ordinal' returns the bin identifier as an integer value.
strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
    Strategy used to define the widths of the bins: 'uniform' (identical widths), 'quantile' (same number of points per bin), or 'kmeans' (values in each bin share the same nearest 1D k-means center).
onehot_drop : {'first', 'if_binary'} or an array-like of shape (len(dcols),), default='if_binary'
    Methodology to drop one of the categories per feature when encode = "onehot": None retains all features; 'first' drops the first category in each feature (a feature with a single category is dropped entirely); 'if_binary' drops the first category only in features with exactly two categories.
Attributes
discretizer_ : object of class KBinsDiscretizer()
    Primary discretization method used to bin numeric data
manual_discretizer_ : dictionary
    Provides bin_edges to feed into _discretize_to_bins() and do quantile discretization manually for features where KBinsDiscretizer() failed. Ignored if strategy != 'quantile' or no errors in KBinsDiscretizer().
onehot_ : object of class OneHotEncoder()
    One hot encoding fit. Ignored if encode != 'onehot'
Examples
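A minimal usage sketch (synthetic data; column names are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame({'age': rng.uniform(20, 80, 100),
                  'income': rng.exponential(50, 100),
                  'city': ['sf'] * 50 + ['nyc'] * 50})  # passes through untouched
disc = BasicDiscretizer(n_bins=4, dcols=['age', 'income'],
                        encode='onehot', strategy='quantile')
X_disc = disc.fit_transform(X)
# X_disc holds one-hot bin indicator columns for 'age' and 'income',
# joined with the unmodified 'city' column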
Ancestors
- AbstractDiscretizer
- sklearn.base.TransformerMixin
- sklearn.utils._set_output._SetOutputMixin
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
Methods
def fit(self, X, y=None)
-
Fit the estimator.
Parameters
X : data frame of shape (n_samples, n_features)
    (Training) data to be discretized.
y : Ignored
    This parameter exists only for compatibility with sklearn.pipeline.Pipeline and the fit_transform method.
Returns
self
def transform(self, X)
-
Discretize the data.
Parameters
X : data frame of shape (n_samples, n_features)
    Data to be discretized.
Returns
X_discretized : data frame
    Data with features in dcols transformed to the binned space. All other features remain unchanged.
class ExtraBasicDiscretizer (dcols, n_bins=4, strategy='quantile', onehot_drop='if_binary')
-
Discretize provided columns into bins and return in one-hot format. Generates meaningful column names based on bin edges. Wraps KBinsDiscretizer from sklearn.
Params
dcols : list of strings
    The names of the columns to be discretized.
n_bins : int or array-like of shape (len(dcols),), default=4
    Number of bins to discretize each feature into.
strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
    Strategy used to define the widths of the bins: 'uniform' (identical widths), 'quantile' (same number of points per bin), or 'kmeans' (values in each bin share the same nearest 1D k-means center).
onehot_drop : {'first', 'if_binary'} or an array-like of shape (len(dcols),), default='if_binary'
    Methodology to drop one of the categories per feature: None retains all features; 'first' drops the first category in each feature (a feature with a single category is dropped entirely); 'if_binary' drops the first category only in features with exactly two categories.
Attributes
discretizer_ : object of class KBinsDiscretizer()
    Primary discretization method used to bin numeric data
Examples
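A minimal usage sketch; the bin edges, and therefore the exact output column names, depend on the data:

import pandas as pd

X = pd.DataFrame({'temp': [3.1, 4.7, 9.2, 12.8, 15.0, 21.4]})
disc = ExtraBasicDiscretizer(dcols=['temp'], n_bins=3)
X_disc = disc.fit_transform(X)
# each output column is named '<col>_<lower>_to_<upper>' after the
# edges of the bin it represents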
Ancestors
- sklearn.base.TransformerMixin
- sklearn.utils._set_output._SetOutputMixin
Methods
def fit(self, X, y=None)
-
Fit the estimator.
Parameters
X : data frame of shape (n_samples, n_features)
    (Training) data to be discretized.
y : Ignored
    This parameter exists only for compatibility with sklearn.pipeline.Pipeline and the fit_transform method.
Returns
self
def transform(self, X)
-
Discretize the data.
Parameters
X : data frame of shape (n_samples, n_features)
    Data to be discretized.
Returns
X_discretized : data frame
    Data with features in dcols transformed to the binned space. All other features remain unchanged.
class RFDiscretizer (rf_model=None, classification=False, n_bins=2, dcols=[], encode='onehot', strategy='quantile', backup_strategy='quantile', onehot_drop='if_binary')
-
Discretize numeric data into bins using RF splits.
Parameters
rf_model : RandomForestClassifier() or RandomForestRegressor()
    RF model from which to extract splits for discretization. Default is RandomForestClassifier(n_estimators=500) or RandomForestRegressor(n_estimators=500).
classification : boolean, default=False
    Used only if rf_model=None. If True, rf_model=RandomForestClassifier(n_estimators=500); else, rf_model=RandomForestRegressor(n_estimators=500).
n_bins : int or array-like of shape (len(dcols),), default=2
    Number of bins to discretize each feature into.
dcols : list of strings
    The names of the columns to be discretized; by default, discretize all float and int columns in X.
encode : {'onehot', 'ordinal'}, default='onehot'
    Method used to encode the transformed result: 'onehot' returns a dense one-hot encoding; 'ordinal' returns the bin identifier as an integer value.
strategy : {'uniform', 'quantile'}, default='quantile'
    Strategy used to choose RF split points: 'uniform' (split points uniformly spaced out) or 'quantile' (split points based on equally-spaced quantiles).
backup_strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
    Strategy used to define the widths of the bins if no RF splits exist for a feature; passed to KBinsDiscretizer: 'uniform' (identical widths), 'quantile' (same number of points per bin), or 'kmeans' (values in each bin share the same nearest 1D k-means center).
onehot_drop : {'first', 'if_binary'} or array-like of shape (len(dcols),), default='if_binary'
    Methodology to drop one of the categories per feature when encode = "onehot": None retains all features; 'first' drops the first category in each feature (a feature with a single category is dropped entirely); 'if_binary' drops the first category only in features with exactly two categories.
Attributes
rf_splits : dictionary
    Maps each feature name to the array of all RF split threshold values for that feature.
bin_edges_ : dictionary
    Maps each feature name to the array of bin edges used for discretization, taken from RF split values.
missing_rf_cols_ : array-like
    List of features that were not used in the RF.
backup_discretizer_ : object of class BasicDiscretizer()
    Discretization method used to bin numeric data for features in missing_rf_cols_.
onehot_ : object of class OneHotEncoder()
    One hot encoding fit. Ignored if encode != 'onehot'.
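A minimal usage sketch on synthetic data (names illustrative). With encode='ordinal', each selected column is replaced by integer bin IDs whose edges come from the forest's split thresholds:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame({'x1': rng.normal(size=200), 'x2': rng.normal(size=200)})
y = (X['x1'] + 0.5 * rng.normal(size=200) > 0).astype(int)

disc = RFDiscretizer(classification=True, n_bins=3, encode='ordinal')
disc.reweight_n_bins(X, y)  # optional: features with more RF splits get more bins
X_disc = disc.fit(X, y).transform(X)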
Expand source code
class RFDiscretizer(AbstractDiscretizer):
    """
    Discretize numeric data into bins using RF splits.

    Parameters
    ----------
    rf_model : RandomForestClassifier() or RandomForestRegressor()
        RF model from which to extract splits for discretization.
        Default is RandomForestClassifier(n_estimators=500)
        or RandomForestRegressor(n_estimators=500)

    classification : boolean; default=False
        Used only if rf_model=None. If True,
        rf_model=RandomForestClassifier(n_estimators=500).
        Else, rf_model=RandomForestRegressor(n_estimators=500)

    n_bins : int or array-like of shape (len(dcols),), default=2
        Number of bins to discretize each feature into.

    dcols : list of strings
        The names of the columns to be discretized; by default,
        discretize all float and int columns in X.

    encode : {'onehot', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        onehot
            Encode the transformed result with one-hot encoding and
            return a dense array.
        ordinal
            Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile'}, default='quantile'
        Strategy used to choose RF split points.

        uniform
            RF split points chosen to be uniformly spaced out.
        quantile
            RF split points chosen based on equally-spaced quantiles.

    backup_strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins if no RF splits
        exist for that feature. Used in KBinsDiscretizer.

        uniform
            All bins in each feature have identical widths.
        quantile
            All bins in each feature have the same number of points.
        kmeans
            Values in each bin have the same nearest center of a 1D
            k-means cluster.

    onehot_drop : {'first', 'if_binary'} or array-like of shape (len(dcols),), default='if_binary'
        Specifies a methodology to use to drop one of the categories
        per feature when encode = "onehot".

        None
            Retain all features (the default).
        'first'
            Drop the first category in each feature. If only one category
            is present, the feature will be dropped entirely.
        'if_binary'
            Drop the first category in each feature with two categories.
            Features with 1 or more than 2 categories are left intact.

    Attributes
    ----------
    rf_splits : dictionary where
        key = feature name
        value = array of all RF split threshold values

    bin_edges_ : dictionary where
        key = feature name
        value = array of bin edges used for discretization, taken from
        RF split values

    missing_rf_cols_ : array-like
        List of features that were not used in RF

    backup_discretizer_ : object of class BasicDiscretizer()
        Discretization method used to bin numeric data for features
        in missing_rf_cols_

    onehot_ : object of class OneHotEncoder()
        Fitted one-hot encoder. Ignored if encode != 'onehot'
    """

    def __init__(self, rf_model=None, classification=False,
                 n_bins=2, dcols=[], encode='onehot',
                 strategy='quantile', backup_strategy='quantile',
                 onehot_drop='if_binary'):
        super().__init__(n_bins=n_bins, dcols=dcols,
                         encode=encode, strategy=strategy,
                         onehot_drop=onehot_drop)
        self.backup_strategy = backup_strategy
        self.rf_model = rf_model
        if rf_model is None:
            self.classification = classification

    def _validate_args(self):
        """
        Check if encode, strategy, backup_strategy arguments are valid.
        """
        super()._validate_args()
        valid_backup_strategy = ('uniform', 'quantile', 'kmeans')
        if self.backup_strategy not in valid_backup_strategy:
            raise ValueError("Valid options for 'backup_strategy' are {}. "
                             "Got backup_strategy={!r} instead."
                             .format(valid_backup_strategy, self.backup_strategy))

    def _get_rf_splits(self, col_names):
        """
        Get all splits in random forest ensemble

        Parameters
        ----------
        col_names : array-like of shape (n_features,)
            Column names for X used to train rf_model

        Returns
        -------
        rule_dict : dictionary where
            key = feature name
            value = array of all RF split threshold values
        """
        rule_dict = {}
        for model in self.rf_model.estimators_:
            tree = model.tree_
            tree_it = enumerate(zip(tree.children_left,
                                    tree.children_right,
                                    tree.feature,
                                    tree.threshold))
            for node_idx, data in tree_it:
                left, right, feature, th = data
                if left != -1 or right != -1:  # internal (non-leaf) node
                    feature = col_names[feature]
                    if feature in rule_dict:
                        rule_dict[feature].append(th)
                    else:
                        rule_dict[feature] = [th]
        return rule_dict

    def _fit_rf(self, X, y=None):
        """
        Fit random forest (if necessary) and obtain RF split thresholds

        Parameters
        ----------
        X : data frame of shape (n_samples, n_features)
            Training data used to fit RF

        y : array-like of shape (n_samples,)
            Training response vector used to fit RF

        Returns
        -------
        rf_splits : dictionary where
            key = feature name
            value = array of all RF split threshold values
        """
        # if no rf_model given, train default random forest model
        if self.rf_model is None:
            if y is None:
                raise ValueError("Must provide y if rf_model is not given.")
            if self.classification:
                self.rf_model = RandomForestClassifier(n_estimators=500)
            else:
                self.rf_model = RandomForestRegressor(n_estimators=500)
            self.rf_model.fit(X, y)
        else:
            # fit the provided rf_model if it has not yet been trained;
            # check_is_fitted raises NotFittedError rather than returning a bool
            # (requires: from sklearn.exceptions import NotFittedError)
            try:
                check_is_fitted(self.rf_model)
            except NotFittedError:
                if y is None:
                    raise ValueError(
                        "Must provide y if rf_model has not been trained.")
                self.rf_model.fit(X, y)

        # get all random forest split points
        self.rf_splits = self._get_rf_splits(list(X.columns))

    def reweight_n_bins(self, X, y=None, by="nsplits"):
        """
        Reallocate number of bins per feature.

        Parameters
        ----------
        X : data frame of shape (n_samples, n_features)
            (Training) data to be discretized.

        y : array-like of shape (n_samples,)
            (Training) response vector. Required only if rf_model = None
            or rf_model has not yet been fitted

        by : {'nsplits'}, default='nsplits'
            Specifies how to reallocate number of bins per feature.

            nsplits
                Reallocate the bins so that each feature in dcols gets
                a minimum of 2 bins, with the remaining bins distributed
                proportionally to the number of RF splits using that feature

        Returns
        -------
        self.n_bins : array of shape (len(dcols),)
            Number of bins per feature, reallocated according to
            the 'by' argument
        """
        # initialization and error checking
        self._fit_preprocessing(X)

        # get all random forest split points
        self._fit_rf(X=X, y=y)

        # get total number of bins to reallocate
        total_bins = self.n_bins.sum()

        # reweight n_bins
        if by == "nsplits":
            # each col gets at least 2 bins; remaining bins get
            # reallocated based on number of RF splits using that feature
            n_rules = np.array([len(self.rf_splits[col])
                                for col in self.dcols_])
            self.n_bins = (np.round(n_rules / n_rules.sum() *
                                    (total_bins - 2 * len(self.dcols_)))
                           + 2).astype(int)
        else:
            valid_by = ('nsplits',)
            raise ValueError("Valid options for 'by' are {}. Got by={!r} instead."
                             .format(valid_by, by))
        return self.n_bins

    def fit(self, X, y=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : data frame of shape (n_samples, n_features)
            (Training) data to be discretized.

        y : array-like of shape (n_samples,)
            (Training) response vector. Required only if rf_model = None
            or rf_model has not yet been fitted

        Returns
        -------
        self
        """
        # initialization and error checking
        self._fit_preprocessing(X)

        # get all random forest split points
        self._fit_rf(X=X, y=y)

        # features that were not used in the rf but need to be discretized
        self.missing_rf_cols_ = list(set(self.dcols_) -
                                     set(self.rf_splits.keys()))
        if len(self.missing_rf_cols_) > 0:
            print("{} did not appear in random forest so were discretized via {} discretization"
                  .format(self.missing_rf_cols_, self.backup_strategy))
            missing_n_bins = np.array([self.n_bins[np.array(self.dcols_) == col][0]
                                       for col in self.missing_rf_cols_])
            backup_discretizer = BasicDiscretizer(n_bins=missing_n_bins,
                                                  dcols=self.missing_rf_cols_,
                                                  encode='ordinal',
                                                  strategy=self.backup_strategy)
            backup_discretizer.fit(X[self.missing_rf_cols_])
            self.backup_discretizer_ = backup_discretizer
        else:
            self.backup_discretizer_ = None

        if self.encode == 'onehot':
            if len(self.missing_rf_cols_) > 0:
                discretized_df = backup_discretizer.transform(
                    X[self.missing_rf_cols_])
            else:
                discretized_df = pd.DataFrame({}, index=X.index)

        # do discretization based on rf split thresholds
        self.bin_edges_ = dict()
        for col in self.dcols_:
            if col in self.rf_splits.keys():
                b = int(self.n_bins[np.array(self.dcols_) == col][0])
                if self.strategy == "quantile":
                    q_values = np.linspace(0, 1, b + 1)
                    bin_edges = np.quantile(self.rf_splits[col], q_values)
                elif self.strategy == "uniform":
                    width = (max(self.rf_splits[col]) -
                             min(self.rf_splits[col])) / b
                    bin_edges = width * np.arange(0, b + 1) + \
                        min(self.rf_splits[col])
                self.bin_edges_[col] = bin_edges
                if self.encode == 'onehot':
                    discretized_df[col] = self._discretize_to_bins(
                        X[col], bin_edges)

        # fit onehot encoder on the discretized columns if specified
        if self.encode == "onehot":
            onehot = OneHotEncoder(drop=self.onehot_drop)
            onehot.fit(discretized_df[self.dcols_].astype(str))
            self.onehot_ = onehot

        return self

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : data frame of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        X_discretized : data frame
            Data with features in dcols transformed to the
            binned space. All other features remain unchanged.
        """
        check_is_fitted(self)

        # transform features that did not appear in RF
        if len(self.missing_rf_cols_) > 0:
            discretized_df = self.backup_discretizer_.transform(
                X[self.missing_rf_cols_])
            discretized_df = pd.DataFrame(discretized_df,
                                          columns=self.missing_rf_cols_,
                                          index=X.index)
        else:
            discretized_df = pd.DataFrame({}, index=X.index)

        # do discretization based on rf split thresholds
        for col in self.bin_edges_.keys():
            discretized_df[col] = self._discretize_to_bins(
                X[col], self.bin_edges_[col])

        # return onehot encoded data if specified and
        # join discretized columns with rest of X
        X_discretized = self._transform_postprocessing(discretized_df, X)

        return X_discretized
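The core of the class is the split-harvesting loop in _get_rf_splits: every internal node of every tree in the forest contributes its threshold to the feature it splits on. The following standalone sketch reproduces that idea using only scikit-learn's public tree_ arrays (the data and variable names are invented for illustration):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
X = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
y = X["x1"] + 0.1 * rng.normal(size=200)
rf = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)

# collect every internal-node threshold, keyed by feature name
splits = {}
for est in rf.estimators_:
    tree = est.tree_
    for node in range(tree.node_count):
        # leaf nodes have children_left == children_right == -1
        if tree.children_left[node] != tree.children_right[node]:
            name = X.columns[tree.feature[node]]
            splits.setdefault(name, []).append(tree.threshold[node])

print({k: len(v) for k, v in splits.items()})  # split counts per feature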
Ancestors
- AbstractDiscretizer
- sklearn.base.TransformerMixin
- sklearn.utils._set_output._SetOutputMixin
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
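Taken together, the workflow mirrors KBinsDiscretizer: fit on a data frame, then transform it. Below is a minimal end-to-end sketch on synthetic data; it assumes RFDiscretizer is imported from this module, and all values are illustrative:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
X = pd.DataFrame({"age": rng.uniform(18, 90, size=300),
                  "income": rng.lognormal(10, 1, size=300)})
y = (X["age"] > 50).astype(int)

# rf_model=None with classification=True trains a default
# RandomForestClassifier(n_estimators=500) internally
disc = RFDiscretizer(n_bins=3, encode="ordinal", classification=True)
X_binned = disc.fit(X, y).transform(X)
print(X_binned.head())  # columns in dcols replaced by bin identifiers
print(disc.bin_edges_)  # RF-derived bin edges per feature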
Methods
def fit(self, X, y=None)
Fit the estimator.
Parameters
X : data frame of shape (n_samples, n_features)
- (Training) data to be discretized.
y : array-like of shape (n_samples,)
- (Training) response vector. Required only if rf_model = None or rf_model has not yet been fitted.
Returns
self
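Note that y is needed only while the forest still has to be trained; a model fitted ahead of time can be passed in directly, in which case fit simply reads the splits off it (see the fitted-check in _fit_rf above). A short sketch with synthetic data and illustrative names:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(2)
X = pd.DataFrame({"a": rng.normal(size=200), "b": rng.normal(size=200)})
y = 2 * X["a"] + rng.normal(size=200)

rf = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
disc = RFDiscretizer(rf_model=rf, n_bins=4, encode="ordinal")
disc.fit(X)  # no y needed: splits come from the already-fitted forest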
def reweight_n_bins(self, X, y=None, by='nsplits')
Reallocate number of bins per feature.
Parameters
X : data frame of shape (n_samples, n_features)
- (Training) data to be discretized.
y : array-like of shape (n_samples,)
- (Training) response vector. Required only if rf_model = None or rf_model has not yet been fitted.
by : {'nsplits'}, default='nsplits'
- Specifies how to reallocate the number of bins per feature.
nsplits
Reallocate the bins so that each feature in dcols gets a minimum of 2 bins, with the remaining bins distributed proportionally to the number of RF splits using that feature.
Returns
self.n_bins : array of shape (len(dcols),)
- Number of bins per feature, reallocated according to the 'by' argument.
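The 'nsplits' rule keeps the total bin budget fixed: every column is guaranteed 2 bins, and the remainder is shared in proportion to how often the RF split on each feature. The arithmetic in isolation, with invented counts:

import numpy as np

n_bins = np.array([4, 4, 4])      # starting allocation: 12 bins total
n_rules = np.array([30, 20, 10])  # RF split counts per feature (invented)

total_bins = n_bins.sum()
reweighted = (np.round(n_rules / n_rules.sum()
                       * (total_bins - 2 * len(n_bins))) + 2).astype(int)
print(reweighted)  # [5 4 3]: heavily-split features earn more bins

Because of rounding, the reallocated total matches the original budget only approximately in general.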
def transform(self, X)
Discretize the data.
Parameters
X : data frame of shape (n_samples, n_features)
- Data to be discretized.
Returns
X_discretized : data frame
- Data with features in dcols transformed to the binned space. All other features remain unchanged.
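Only the columns in dcols are binned; everything else is carried through unchanged by the postprocessing join. A brief sketch on synthetic data (assumes RFDiscretizer is in scope):

import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
X = pd.DataFrame({"num": rng.normal(size=150),
                  "other": rng.normal(size=150)})
y = (X["num"] > 0).astype(int)

# bin only 'num'; 'other' passes through untouched
disc = RFDiscretizer(n_bins=3, dcols=["num"], encode="ordinal",
                     classification=True)
X_out = disc.fit(X, y).transform(X)
print(sorted(X_out.columns))  # ['num', 'other']; 'num' now holds bin ids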