Source code
from collections import Counter
from typing import List, Mapping, Union, Optional

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from ..runner import run_models
from ..sklearnmodel import SklearnModel

ImportanceMap = Mapping[int, float]
ImportanceDistributionMap = Mapping[int, List[float]]


def feature_split_proportions(model: SklearnModel, columns: Optional[List[int]]=None) -> Mapping[int, float]:
    """Calculate the proportion of all splits in the model that use each column"""
    split_variables = []
    for sample in model.model_samples:
        for tree in sample.trees:
            for node in tree.nodes:
                splitting_var = node.split.splitting_variable
                split_variables.append(splitting_var)
    counter = Counter(split_variables)
    if columns is None:
        columns = sorted(x for x in counter.keys() if x is not None)

    proportions = {}
    for column in columns:
        if column in counter.keys():
            proportions[column] = counter[column] / len(split_variables)
        else:
            proportions[column] = 0.0

    return proportions


def plot_feature_split_proportions(model: SklearnModel, ax=None):
    if ax is None:
        _, ax = plt.subplots(1, 1)
    proportions = feature_split_proportions(model)

    y_pos = np.arange(len(proportions))
    name, count = list(proportions.keys()), list(proportions.values())
    props = pd.DataFrame({"name": name, "counts": count}).sort_values("name", ascending=True)
    # Draw on the provided axes rather than the current global figure
    ax.barh(y_pos, props.counts, align='center', alpha=0.5)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(props.name)
    ax.set_xlabel('Proportion of all splits')
    ax.set_ylabel('Feature')
    ax.set_title('Proportion of Splits Made on Each Variable')
    return ax


def null_feature_split_proportions_distribution(model: SklearnModel,
                                                X: Union[pd.DataFrame, np.ndarray],
                                                y: np.ndarray,
                                                n_permutations: int=10) -> Mapping[int, List[float]]:
    """
    Calculate a null distribution of proportion of splits on each variable in X

    Works by randomly permuting y to remove any true dependence of y on X and calculating feature importance

    Parameters
    ----------
    model: SklearnModel
        Model specification to work with
    X: Union[pd.DataFrame, np.ndarray]
        Covariate matrix
    y: np.ndarray
        Target data
    n_permutations: int
        How many permutations to run.
        More permutations give a more accurate null distribution, but take longer to run

    Returns
    -------
    Mapping[int, List[float]]
        A list of inclusion proportions for each variable in X
    """

    inclusion_dict = {x: [] for x in range(X.shape[1])}

    y_s = [np.random.permutation(y) for _ in range(n_permutations)]
    X_s = [X for _ in y_s]

    fit_models = run_models(model, X_s, y_s)

    for fit_model in fit_models:
        splits_run = feature_split_proportions(fit_model, list(range(X.shape[1])))
        for key, value in splits_run.items():
            inclusion_dict[key].append(value)

    return inclusion_dict


def plot_null_feature_importance_distributions(null_distributions: Mapping[int, List[float]], ax=None):
    if ax is None:
        _, ax = plt.subplots(1, 1)
    df = pd.DataFrame(null_distributions)
    df = pd.DataFrame(df.unstack()).reset_index().drop("level_1", axis=1)
    df.columns = ["variable", "p"]
    sns.boxplot(x="variable", y="p", data=df, ax=ax)
    ax.set_title("Null Feature Importance Distribution")
    return ax


def local_thresholds(null_distributions: ImportanceDistributionMap, percentile: float) -> Mapping[int, float]:
    """
    Calculate, for each variable, the proportion of splits required for it to be selected

    Creates a null distribution for each variable from the % of splits using that variable in each of the permuted models

    Each variable has its own threshold, independent of the other variables

    Note: this is significantly less stringent than the global threshold

    Parameters
    ----------
    null_distributions: ImportanceDistributionMap
        A mapping from variable to distribution of split inclusion proportions under the null
    percentile: float
        The percentile of the null distribution to use as a cutoff.
        The closer to 1.0, the more stringent the threshold

    Returns
    -------
    Mapping[int, float]
        A lookup from column to % inclusion threshold
    """
    # percentile is given on a 0-1 scale (see docstring); np.percentile expects 0-100
    return {feature: np.percentile(null_distributions[feature], 100 * percentile) for feature in null_distributions}


def global_thresholds(null_distributions: ImportanceDistributionMap, percentile: float) -> Mapping[int, float]:
    """
    Calculate, for each variable, the proportion of splits required for it to be selected

    Creates a distribution of the _highest_ inclusion percentage of any variable in each of the permuted models
    The threshold is set as a percentile of this distribution

    All variables share the same threshold

    Note: this is significantly more stringent than the local threshold

    Parameters
    ----------
    null_distributions: ImportanceDistributionMap
        A mapping from variable to distribution of split inclusion proportions under the null
    percentile: float
        The percentile of the null distribution to use as a cutoff.
        The closer to 1.0, the more stringent the threshold

    Returns
    -------
    Mapping[int, float]
        A lookup from column to % inclusion threshold
    """
    df = pd.DataFrame(null_distributions)
    # For each permuted model (row), take the highest inclusion proportion across variables
    q_s = df.max(axis=1)
    # percentile is given on a 0-1 scale (see docstring); np.percentile expects 0-100
    threshold = np.percentile(q_s, 100 * percentile)
    return {feature: threshold for feature in null_distributions}


def kept_features(feature_proportions: Mapping[int, float], thresholds: Mapping[int, float]) -> List[int]:
    """
    Extract the features to keep

    Parameters
    ----------
    feature_proportions: Mapping[int, float]
        Lookup from variable to % of splits in the model that use that variable
    thresholds: Mapping[int, float]
        Lookup from variable to required % of splits in the model to be kept

    Returns
    -------
    List[int]
        Variables selected for inclusion in the final model
    """
    return [x[0] for x in zip(sorted(feature_proportions.keys()), is_kept(feature_proportions, thresholds)) if x[1]]


def is_kept(feature_proportions: Mapping[int, float], thresholds: Mapping[int, float]) -> List[bool]:
    """
    Determine whether each variable should be kept after selection

    Parameters
    ----------
    feature_proportions: Mapping[int, float]
        Lookup from variable to % of splits in the model that use that variable
    thresholds: Mapping[int, float]
        Lookup from variable to required % of splits in the model to be kept

    Returns
    -------
    List[bool]
        An array of length equal to the width of the covariate matrix
        True if the variable should be kept, False otherwise
    """
    return [feature_proportions[feature] > thresholds[feature] for feature in sorted(feature_proportions.keys())]


def partition_into_passed_and_failed_features(feature_proportions, thresholds):
    kept = kept_features(feature_proportions, thresholds)
    passed_features = {x[0]: x[1] for x in feature_proportions.items() if x[0] in kept}
    failed_features = {x[0]: x[1] for x in feature_proportions.items() if x[0] not in kept}
    return passed_features, failed_features


def plot_feature_proportions_against_thresholds(feature_proportions, thresholds, ax=None):
    if ax is None:
        _, ax = plt.subplots(1, 1)
    passed_features, failed_features = partition_into_passed_and_failed_features(feature_proportions, thresholds)

    # Wrap dict views in list() so matplotlib receives plain sequences
    ax.bar(list(thresholds.keys()), [x * 100 for x in thresholds.values()], width=0.01, color="black", alpha=0.5)
    ax.scatter(list(passed_features.keys()), [x * 100 for x in passed_features.values()], c="g")
    ax.scatter(list(failed_features.keys()), [x * 100 for x in failed_features.values()], c="r")
    ax.set_title("Feature Importance Compared to Threshold")
    ax.set_xlabel("Feature")
    ax.set_ylabel("% Splits")
    return ax
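
Taken together, these functions support a permutation-based feature selection workflow: fit a model, compare each variable's split proportion against a null distribution built from models fit on permuted targets, and keep the variables that clear their thresholds. A minimal sketch follows; the import paths and the SklearnModel constructor defaults are assumptions:

import numpy as np
from bartpy.sklearnmodel import SklearnModel           # assumed import path
from bartpy.diagnostics.features import (              # assumed module path
    feature_split_proportions,
    null_feature_split_proportions_distribution,
    global_thresholds,
    kept_features,
)

# Toy data: y depends only on column 0
np.random.seed(42)
X = np.random.normal(size=(200, 5))
y = X[:, 0] + np.random.normal(scale=0.1, size=200)

model = SklearnModel()  # constructor arguments assumed to have workable defaults
model.fit(X, y)

proportions = feature_split_proportions(model)
null_dist = null_feature_split_proportions_distribution(model, X, y, n_permutations=10)
thresholds = global_thresholds(null_dist, 0.95)
print(kept_features(proportions, thresholds))  # likely [0] for this toy data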

Functions

def feature_split_proportions(model: SklearnModel, columns: Optional[List[int]] = None) -> Mapping[int, float]
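
Computes the share of all splits, pooled across posterior samples and trees, attributed to each column. Passing columns pins the output to a fixed set of indices, with 0.0 for any column that never appears in a split; the values below are purely illustrative:

# With columns supplied, absent features still get an entry
props = feature_split_proportions(model, columns=[0, 1, 2, 3])
# e.g. {0: 0.55, 1: 0.25, 2: 0.20, 3: 0.0}
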
def global_thresholds(null_distributions: Mapping[int, List[float]], percentile: float) -> Mapping[int, float]

Calculate, for each variable, the proportion of splits required for it to be selected

Creates a distribution of the highest inclusion percentage of any variable in each of the permuted models. The threshold is set as a percentile of this distribution.

All variables share the same threshold

Note: this is significantly more stringent than the local threshold

Parameters

null_distributions : ImportanceDistributionMap
    A mapping from variable to distribution of split inclusion proportions under the null
percentile : float
    The percentile of the null distribution to use as a cutoff. The closer to 1.0, the more stringent the threshold.

Returns

Mapping[int, float]
    A lookup from column to % inclusion threshold
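
For instance, with hand-made null distributions (values hypothetical), every feature receives the same cutoff, driven by the per-permutation maxima:

null_dist = {0: [0.30, 0.35, 0.32, 0.28],
             1: [0.40, 0.33, 0.36, 0.41],
             2: [0.30, 0.32, 0.32, 0.31]}
# Row-wise maxima across features: [0.40, 0.35, 0.36, 0.41]
thresholds = global_thresholds(null_dist, 0.95)
# Every feature maps to the same value: the 95th percentile of those maxima (~0.41)
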
def is_kept(feature_proportions: Mapping[int, float], thresholds: Mapping[int, float]) -> List[bool]

Determine whether each variable should be kept after selection

Parameters

feature_proportions : Mapping[int, float]
    Lookup from variable to % of splits in the model that use that variable
thresholds : Mapping[int, float]
    Lookup from variable to required % of splits in the model to be kept

Returns

List[bool]
    An array of length equal to the width of the covariate matrix; True if the variable should be kept, False otherwise
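
A toy call with literal dicts (values hypothetical); results follow sorted feature index:

props = {0: 0.55, 1: 0.25, 2: 0.20}
cuts = {0: 0.40, 1: 0.30, 2: 0.15}
print(is_kept(props, cuts))  # [True, False, True]
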
def kept_features(feature_proportions: Mapping[int, float], thresholds: Mapping[int, float]) -> List[int]

Extract the features to keep

Parameters

feature_proportions : Mapping[int, float]
    Lookup from variable to % of splits in the model that use that variable
thresholds : Mapping[int, float]
    Lookup from variable to required % of splits in the model to be kept

Returns

List[int]
    Variables selected for inclusion in the final model
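
Continuing the toy values from the is_kept example, only indices whose proportion exceeds their threshold survive:

props = {0: 0.55, 1: 0.25, 2: 0.20}
cuts = {0: 0.40, 1: 0.30, 2: 0.15}
print(kept_features(props, cuts))  # [0, 2]
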
def local_thresholds(null_distributions: Mapping[int, List[float]], percentile: float) -> Mapping[int, float]

Calculate, for each variable, the proportion of splits required for it to be selected

Creates a null distribution for each variable from the % of splits using that variable in each of the permuted models

Each variable has its own threshold, independent of the other variables

Note: this is significantly less stringent than the global threshold

Parameters

null_distributions : ImportanceDistributionMap
    A mapping from variable to distribution of split inclusion proportions under the null
percentile : float
    The percentile of the null distribution to use as a cutoff. The closer to 1.0, the more stringent the threshold.

Returns

Mapping[int, float]
    A lookup from column to % inclusion threshold
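
With the same hypothetical null distributions as in the global_thresholds example, each feature's cutoff comes only from its own list, so no feature's local threshold can exceed the shared global one:

null_dist = {0: [0.30, 0.35, 0.32, 0.28],
             1: [0.40, 0.33, 0.36, 0.41],
             2: [0.30, 0.32, 0.32, 0.31]}
thresholds = local_thresholds(null_dist, 0.95)
# Feature 0's cutoff is the 95th percentile of [0.30, 0.35, 0.32, 0.28] alone, and so on
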
def null_feature_split_proportions_distribution(model: SklearnModel, X: Union[numpy.ndarray, pandas.core.frame.DataFrame], y: numpy.ndarray, n_permutations: int = 10) -> Mapping[int, List[float]]

Calculate a null distribution of proportion of splits on each variable in X

Works by randomly permuting y to remove any true dependence of y on X and calculating feature importance

Parameters

model : SklearnModel
    Model specification to work with
X : np.ndarray or pd.DataFrame
    Covariate matrix
y : np.ndarray
    Target data
n_permutations : int
    How many permutations to run. More permutations give a more accurate null distribution, but take longer to run

Returns

Mapping[int, List[float]]
    A list of inclusion proportions for each variable in X
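
Each permutation refits the model on (X, permuted y), so runtime grows linearly with n_permutations. The result maps every column index to one inclusion proportion per permutation; for a fitted model on 3 columns (values illustrative):

null_dist = null_feature_split_proportions_distribution(model, X, y, n_permutations=10)
# e.g. {0: [0.31, 0.35, ...], 1: [0.36, 0.33, ...], 2: [0.33, 0.32, ...]}, 10 values per key
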
def partition_into_passed_and_failed_features(feature_proportions, thresholds)

Split feature_proportions into two lookups: features whose proportion clears their threshold (passed) and those that do not (failed).
def plot_feature_proportions_against_thresholds(feature_proportions, thresholds, ax=None)

Plot each feature's split proportion against its threshold: thresholds are drawn as narrow bars, features that pass as green points, and features that fail as red points.
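
The plotting helpers accept an optional matplotlib Axes and return it, so they compose with subplots. A sketch, reusing model, proportions, and thresholds from the workflow example above:

import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
plot_feature_split_proportions(model, ax=ax1)
plot_feature_proportions_against_thresholds(proportions, thresholds, ax=ax2)
plt.show()
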
def plot_feature_split_proportions(model: SklearnModel, ax=None)

Horizontal bar chart of the proportion of all splits made on each variable in the model.
def plot_null_feature_importance_distributions(null_distributions: Mapping[int, List[float]], ax=None)

Box plots of the null split-proportion distribution for each variable.