Source code
from copy import deepcopy

import numpy as np
from matplotlib import pyplot as plt
from sklearn.base import BaseEstimator

try:
    from sklearn.feature_selection.base import SelectorMixin
except ImportError:  # the module was made private in newer scikit-learn releases
    from sklearn.feature_selection._base import SelectorMixin
from ..diagnostics.features import null_feature_split_proportions_distribution, \
    local_thresholds, global_thresholds, is_kept, feature_split_proportions, \
    plot_feature_proportions_against_thresholds, plot_null_feature_importance_distributions, \
    plot_feature_split_proportions
from ..sklearnmodel import SklearnModel


class SelectSplitProportionThreshold(BaseEstimator, SelectorMixin):
    """Select the features whose split proportion in a fitted BART model exceeds a fixed threshold."""

    def __init__(self,
                 model: SklearnModel,
                 percentile: float = 0.2):
        self.model = deepcopy(model)
        self.percentile = percentile

    def fit(self, X, y):
        self.model.fit(X, y)
        self.X, self.y = X, y
        self.feature_proportions = feature_split_proportions(self.model)
        return self

    def _get_support_mask(self):
        # Keep features whose split proportion is strictly greater than the threshold
        return np.array([proportion > self.percentile for proportion in self.feature_proportions.values()])

    def plot(self):
        plot_feature_split_proportions(self.model)
        plt.show()


class SelectNullDistributionThreshold(BaseEstimator, SelectorMixin):
    """Select the features whose split proportion in a fitted BART model exceeds a permutation-based null threshold."""

    def __init__(self,
                 model: SklearnModel,
                 percentile: float = 0.95,
                 method="local",
                 n_permutations=10,
                 n_trees=None):
        if method == "local":
            self.method = local_thresholds
        elif method == "global":
            self.method = global_thresholds
        else:
            raise NotImplementedError(
                "Currently only local and global methods are supported, found {}".format(method))
        self.model = deepcopy(model)
        if n_trees is not None:
            self.model.n_trees = n_trees
        self.percentile = percentile
        self.n_permutations = n_permutations

    def fit(self, X, y):
        self.model.fit(X, y)
        self.X, self.y = X, y
        # Null distribution of split proportions built from n_permutations permutation fits
        self.null_distribution = null_feature_split_proportions_distribution(self.model, X, y, self.n_permutations)
        self.thresholds = self.method(self.null_distribution, self.percentile)
        self.feature_proportions = feature_split_proportions(self.model)
        return self

    def _get_support_mask(self):
        # Keep features whose observed split proportion exceeds its null-distribution threshold
        return np.array(is_kept(self.feature_proportions, self.thresholds))

    def plot(self):
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 10))
        plot_feature_proportions_against_thresholds(self.feature_proportions, self.thresholds, ax1)
        plot_null_feature_importance_distributions(self.null_distribution, ax2)
        plt.show()

Classes

class SelectNullDistributionThreshold (model: SklearnModel, percentile: float = 0.95, method='local', n_permutations=10, n_trees=None)

Feature selector that keeps the features a fitted BART model splits on more often than expected under a permutation-based null distribution.

The wrapped SklearnModel is copied and fitted to the training data. A null distribution of feature split proportions is then built from n_permutations permutation fits (null_feature_split_proportions_distribution), and thresholds are derived from it at the given percentile using either local_thresholds (method="local") or global_thresholds (method="global"); an illustrative sketch of the local/global distinction follows the Methods list below. A feature is kept when its observed split proportion exceeds its threshold (is_kept). If n_trees is given, it overrides the number of trees of the copied model before fitting.
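
A minimal usage sketch, assuming the package is importable as bartpy with this module at bartpy.features.featureselection (the module path is an assumption), SklearnModel's default constructor arguments, and a small synthetic regression problem:

    import numpy as np

    from bartpy.sklearnmodel import SklearnModel
    from bartpy.features.featureselection import SelectNullDistributionThreshold  # assumed module path

    # Synthetic data: only the first two columns carry signal
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))
    y = X[:, 0] + 2 * X[:, 1] + rng.normal(scale=0.1, size=200)

    selector = SelectNullDistributionThreshold(
        SklearnModel(),      # copied internally, so the original model is left untouched
        percentile=0.95,     # percentile of the null distribution used as the threshold
        method="local",      # or "global"
        n_permutations=10,
        n_trees=20,          # overrides the copied model's n_trees before fitting
    )
    selector.fit(X, y)

    print(selector.get_support())       # boolean mask over the input columns
    X_selected = selector.transform(X)  # keep only the selected columns
    selector.plot()                     # split proportions vs. thresholds, plus the null distributions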

Ancestors

  • sklearn.base.BaseEstimator
  • sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester
  • sklearn.feature_selection._base.SelectorMixin
  • sklearn.base.TransformerMixin
  • sklearn.utils._set_output._SetOutputMixin

Methods

def fit(self, X, y)
Fit the copied model on X and y, build the null distribution of feature split proportions from n_permutations permutation fits, derive the per-feature thresholds at the configured percentile, and record the observed split proportions. Returns self.
def plot(self)
Plot the observed feature split proportions against their thresholds in one panel and the null feature importance distributions in a second panel.
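
The local_thresholds and global_thresholds helpers live in bartpy.diagnostics.features and their source is not shown on this page. The sketch below is only an illustration of the usual local-versus-global permutation-thresholding idea for BART variable selection, not bartpy's actual implementation; the helper names are hypothetical and the null distribution is assumed to be an array of shape (n_permutations, n_features):

    import numpy as np

    def local_thresholds_sketch(null_split_proportions: np.ndarray, percentile: float) -> np.ndarray:
        # Hypothetical: one threshold per feature, taken from that feature's own null distribution
        return np.percentile(null_split_proportions, 100 * percentile, axis=0)

    def global_thresholds_sketch(null_split_proportions: np.ndarray, percentile: float) -> np.ndarray:
        # Hypothetical: a single threshold shared by all features, taken from the distribution
        # of the per-permutation maximum split proportion
        max_per_permutation = null_split_proportions.max(axis=1)
        threshold = np.percentile(max_per_permutation, 100 * percentile)
        return np.full(null_split_proportions.shape[1], threshold)

Read this way, the local rule asks whether each feature beats its own chance level, while the global rule is stricter and asks whether it beats the best-performing feature of each permutation fit.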
class SelectSplitProportionThreshold (model: SklearnModel, percentile: float = 0.2)

Feature selector that keeps the features a fitted BART model splits on sufficiently often.

The wrapped SklearnModel is copied and fitted to the training data, the proportion of splits attributable to each feature is computed (feature_split_proportions), and a feature is kept when that proportion is strictly greater than percentile. Note that, despite its name, percentile is compared directly against the split proportions rather than being treated as a percentile of any distribution.
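
A minimal usage sketch under the same assumptions as above (package importable as bartpy, module path assumed, SklearnModel defaults), reusing the X and y from the earlier example:

    from bartpy.sklearnmodel import SklearnModel
    from bartpy.features.featureselection import SelectSplitProportionThreshold  # assumed module path

    # Keep the features that account for more than 20% of the model's splits
    selector = SelectSplitProportionThreshold(SklearnModel(), percentile=0.2)
    X_selected = selector.fit(X, y).transform(X)
    selector.plot()  # per-feature split proportions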

Ancestors

  • sklearn.base.BaseEstimator
  • sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester
  • sklearn.feature_selection._base.SelectorMixin
  • sklearn.base.TransformerMixin
  • sklearn.utils._set_output._SetOutputMixin

Methods

def fit(self, X, y)
Fit the copied model on X and y and record each feature's split proportion. Returns self.
def plot(self)
Plot the fitted model's feature split proportions.
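
Because both selectors inherit from SelectorMixin, they can also serve as a transform step inside a scikit-learn Pipeline. A hedged sketch, again assuming the bartpy import paths used above and reusing X and y from the earlier examples:

    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline

    from bartpy.sklearnmodel import SklearnModel
    from bartpy.features.featureselection import SelectNullDistributionThreshold  # assumed module path

    pipeline = Pipeline([
        # Screen features with the BART null-distribution selector, then fit a simple model on the survivors
        ("select", SelectNullDistributionThreshold(SklearnModel(), percentile=0.95, n_permutations=5)),
        ("regress", LinearRegression()),
    ])
    pipeline.fit(X, y)
    print(pipeline.predict(X[:5]))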