Expand source code
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
class SimpleDiscretizer:
def __init__(self, n_bins: int = 8, strategy: str = 'uniform'):
self.n_bins = n_bins
self.strategy = strategy
def fit(self, X: np.array, feature_labels: np.array):
self.is_categorical = np.array([set(np.unique(X[:, i])).issubset({0, 1}) for i in np.arange(X.shape[1])])
if False not in self.is_categorical:
self.feature_labels = feature_labels
self.discretizer = None
return
if isinstance(feature_labels, list):
feature_labels = np.array(feature_labels)
# X_categorical = X[:, self.is_categorical]
X_categorical_columns = feature_labels[self.is_categorical]
# X_numeric = X[:, ~self.is_categorical]
X_numeric_columns = feature_labels[~self.is_categorical]
self.discretizer = KBinsDiscretizer(n_bins=self.n_bins, encode='onehot', strategy=self.strategy)
# X_numeric_discretized = self.discretizer.fit(X_numeric)
discretized_featnames = []
for feat_name, bin_edges in zip(X_numeric_columns, self.discretizer.bin_edges_):
be_str = bin_edges.astype(str)
discretized_featnames += (
[f'{feat_name}_' + '_to_'.join([be_str[i], be_str[i + 1]]) for i in range(bin_edges.shape[0] - 1)]
)
self.featnames_after_disc = np.append(discretized_featnames, X_categorical_columns)
def transform(self, X: np.array):
if self.discretizer is None:
return pd.DataFrame(X, columns=self.feature_labels)
X_categorical = X[:, self.is_categorical]
X_numeric = X[:, ~self.is_categorical]
X_numeric_discretized = self.discretizer.transform(X_numeric).toarray()
X_concat = np.concatenate((X_numeric_discretized, X_categorical), axis=1)
X_df_onehot = pd.DataFrame(X_concat, columns=self.featnames_after_disc)
return X_df_onehot
def fit_transform(self, X: np.array, feature_labels: np.array):
self.fit(X, feature_labels)
return self.transform(X)
Classes
class SimpleDiscretizer (n_bins: int = 8, strategy: str = 'uniform')
-
Expand source code
class SimpleDiscretizer: def __init__(self, n_bins: int = 8, strategy: str = 'uniform'): self.n_bins = n_bins self.strategy = strategy def fit(self, X: np.array, feature_labels: np.array): self.is_categorical = np.array([set(np.unique(X[:, i])).issubset({0, 1}) for i in np.arange(X.shape[1])]) if False not in self.is_categorical: self.feature_labels = feature_labels self.discretizer = None return if isinstance(feature_labels, list): feature_labels = np.array(feature_labels) # X_categorical = X[:, self.is_categorical] X_categorical_columns = feature_labels[self.is_categorical] # X_numeric = X[:, ~self.is_categorical] X_numeric_columns = feature_labels[~self.is_categorical] self.discretizer = KBinsDiscretizer(n_bins=self.n_bins, encode='onehot', strategy=self.strategy) # X_numeric_discretized = self.discretizer.fit(X_numeric) discretized_featnames = [] for feat_name, bin_edges in zip(X_numeric_columns, self.discretizer.bin_edges_): be_str = bin_edges.astype(str) discretized_featnames += ( [f'{feat_name}_' + '_to_'.join([be_str[i], be_str[i + 1]]) for i in range(bin_edges.shape[0] - 1)] ) self.featnames_after_disc = np.append(discretized_featnames, X_categorical_columns) def transform(self, X: np.array): if self.discretizer is None: return pd.DataFrame(X, columns=self.feature_labels) X_categorical = X[:, self.is_categorical] X_numeric = X[:, ~self.is_categorical] X_numeric_discretized = self.discretizer.transform(X_numeric).toarray() X_concat = np.concatenate((X_numeric_discretized, X_categorical), axis=1) X_df_onehot = pd.DataFrame(X_concat, columns=self.featnames_after_disc) return X_df_onehot def fit_transform(self, X: np.array, feature_labels: np.array): self.fit(X, feature_labels) return self.transform(X)
Methods
def fit(self, X:
, feature_labels: ) -
Expand source code
def fit(self, X: np.array, feature_labels: np.array): self.is_categorical = np.array([set(np.unique(X[:, i])).issubset({0, 1}) for i in np.arange(X.shape[1])]) if False not in self.is_categorical: self.feature_labels = feature_labels self.discretizer = None return if isinstance(feature_labels, list): feature_labels = np.array(feature_labels) # X_categorical = X[:, self.is_categorical] X_categorical_columns = feature_labels[self.is_categorical] # X_numeric = X[:, ~self.is_categorical] X_numeric_columns = feature_labels[~self.is_categorical] self.discretizer = KBinsDiscretizer(n_bins=self.n_bins, encode='onehot', strategy=self.strategy) # X_numeric_discretized = self.discretizer.fit(X_numeric) discretized_featnames = [] for feat_name, bin_edges in zip(X_numeric_columns, self.discretizer.bin_edges_): be_str = bin_edges.astype(str) discretized_featnames += ( [f'{feat_name}_' + '_to_'.join([be_str[i], be_str[i + 1]]) for i in range(bin_edges.shape[0] - 1)] ) self.featnames_after_disc = np.append(discretized_featnames, X_categorical_columns)
def fit_transform(self, X:
, feature_labels: ) -
Expand source code
def fit_transform(self, X: np.array, feature_labels: np.array): self.fit(X, feature_labels) return self.transform(X)
def transform(self, X:
) -
Expand source code
def transform(self, X: np.array): if self.discretizer is None: return pd.DataFrame(X, columns=self.feature_labels) X_categorical = X[:, self.is_categorical] X_numeric = X[:, ~self.is_categorical] X_numeric_discretized = self.discretizer.transform(X_numeric).toarray() X_concat = np.concatenate((X_numeric_discretized, X_categorical), axis=1) X_df_onehot = pd.DataFrame(X_concat, columns=self.featnames_after_disc) return X_df_onehot