Expand source code
from collections import Counter
from typing import List, Mapping, Union, Optional

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from ..runner import run_models
from ..sklearnmodel import SklearnModel

ImportanceMap = Mapping[int, float]
ImportanceDistributionMap = Mapping[int, List[float]]

def feature_split_proportions(model: SklearnModel, columns: Optional[List[int]]=None) -> Mapping[int, float]:
    """
    Calculate the share of tree nodes that split on each variable

    Walks every node of every tree of every sample in ``model.model_samples``
    and tallies ``node.split.splitting_variable``

    Parameters
    ----------
    model: SklearnModel
        Model whose ``model_samples`` are inspected
    columns: Optional[List[int]]
        Variables to report on.
        If None, every non-None splitting variable that was observed is reported, in sorted order

    Returns
    -------
    Mapping[int, float]
        A lookup from variable to its count divided by the total number of nodes visited.
        Nodes whose splitting variable is None are counted in the denominator.
        Requested variables that were never split on map to 0.0
    """
    split_variables = [
        node.split.splitting_variable
        for sample in model.model_samples
        for tree in sample.trees
        for node in tree.nodes
    ]
    counter = Counter(split_variables)
    if columns is None:
        columns = sorted(x for x in counter if x is not None)

    n_nodes = len(split_variables)
    # Membership is tested before dividing, so a model without any nodes never divides by zero
    return {column: counter[column] / n_nodes if column in counter else 0.0 for column in columns}

def plot_feature_split_proportions(model: SklearnModel, ax=None):
    """
    Draw a horizontal bar chart of the proportion of splits made on each variable

    Parameters
    ----------
    model: SklearnModel
        Model passed to feature_split_proportions to obtain the proportions
    ax
        Matplotlib axes to draw on.
        If None, a new figure with a single set of axes is created

    Returns
    -------
    The axes that were drawn on
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)
    proportions = feature_split_proportions(model)

    props = pd.DataFrame({
        "name": list(proportions.keys()),
        "counts": list(proportions.values()),
    }).sort_values("name", ascending=True)
    y_pos = np.arange(len(props))
    # Draw on the supplied axes rather than pyplot's current axes, so a caller-provided `ax` is honoured
    ax.barh(y_pos, props["counts"], align='center', alpha=0.5)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(props["name"])
    ax.set_xlabel('Proportion of all splits')
    ax.set_title('Proportion of Splits Made on Each Variable')
    return ax

def null_feature_split_proportions_distribution(model: SklearnModel,
                                                X: Union[pd.DataFrame, np.ndarray],
                                                y: np.ndarray,
                                                n_permutations: int=10) -> Mapping[int, List[float]]:
    """
    Calculate a null distribution of proportion of splits on each variable in X

    Works by randomly permuting y to remove any true dependence of y on X and calculating feature importance

    Parameters
    ----------
    model: SklearnModel
        Model specification to work with
    X: np.ndarray
        Covariate matrix
    y: np.ndarray
        Target data
    n_permutations: int
        How many permutations to run
        The higher the number of permutations, the more accurate the null distribution, but the longer it will take to run

    Returns
    -------
    Mapping[int, List[float]]
        A list of inclusion proportions for each variable in X,
        with one entry appended per model returned by run_models
    """
    n_columns = X.shape[1]
    inclusion_dict = {x: [] for x in range(n_columns)}

    y_s = [np.random.permutation(y) for _ in range(n_permutations)]
    X_s = [X for _ in y_s]

    # NOTE(review): run_models presumably fits one model per (X, y) pair - confirm against ..runner
    fit_models = run_models(model, X_s, y_s)

    # Ask for every column explicitly so variables that were never split on still record 0.0
    columns = list(range(n_columns))
    for fit_model in fit_models:
        splits_run = feature_split_proportions(fit_model, columns)
        for key, value in splits_run.items():
            inclusion_dict[key].append(value)

    return inclusion_dict

def plot_null_feature_importance_distributions(null_distributions: Mapping[int, List[float]], ax=None):
    """
    Draw one box plot per variable of its split proportions under the null

    Parameters
    ----------
    null_distributions: Mapping[int, List[float]]
        A mapping from variable to distribution of split inclusion proportions under the null
    ax
        Matplotlib axes to draw on.
        If None, a new figure with a single set of axes is created

    Returns
    -------
    The axes that were drawn on
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)
    df = pd.DataFrame(null_distributions)
    # Reshape from one column per variable to long form: one (variable, p) row per observation
    df = pd.DataFrame(df.unstack()).reset_index().drop("level_1", axis=1)
    df.columns = ["variable", "p"]
    sns.boxplot(x="variable", y="p", data=df, ax=ax)
    ax.set_title("Null Feature Importance Distribution")
    return ax

def local_thresholds(null_distributions: ImportanceDistributionMap, percentile: float) -> Mapping[int, float]:
    """
    Calculate the required proportion of splits to be selected by variable

    Creates a null distribution for each variable based on the % of splits including that variable in each of the permuted models

    Each variable has its own threshold that is independent of the other variables

    Note - this is significantly less stringent than the global threshold

    Parameters
    ----------
    null_distributions: ImportanceDistributionMap
        A mapping from variable to distribution of split inclusion proportions under the null
    percentile: float
        The percentile of the null distribution to use as a cutoff.
        Passed unchanged to np.percentile, which expects a value between 0 and 100.
        The higher the percentile, the more stringent the threshold

    Returns
    -------
    Mapping[int, float]
        A lookup from column to % inclusion threshold
    """
    return {
        feature: np.percentile(distribution, percentile)
        for feature, distribution in null_distributions.items()
    }

def global_thresholds(null_distributions: ImportanceDistributionMap, percentile: float) -> Mapping[int, float]:
    """
    Calculate the required proportion of splits to be selected by variable

    Creates a distribution of the _highest_ inclusion percentage of any variable in each of the permuted models
    Threshold is set as a percentile of this distribution

    All variables have the same threshold

    Note that this is significantly more stringent than the local threshold

    Parameters
    ----------
    null_distributions: ImportanceDistributionMap
        A mapping from variable to distribution of split inclusion proportions under the null
    percentile: float
        The percentile of the null distribution to use as a cutoff.
        Passed unchanged to np.percentile, which expects a value between 0 and 100.
        The higher the percentile, the more stringent the threshold

    Returns
    -------
    Mapping[int, float]
        A lookup from column to % inclusion threshold.
        Empty if null_distributions is empty
    """
    if not null_distributions:
        # No variables means there is no distribution to take a percentile of
        return {}
    df = pd.DataFrame(null_distributions)
    # Each row holds every variable's proportion for one permuted model, so the row-wise max
    # is the highest inclusion proportion achieved by any variable in that model
    q_s = df.max(axis=1).to_numpy()
    threshold = np.percentile(q_s, percentile)
    return {feature: threshold for feature in null_distributions}

def kept_features(feature_proportions: Mapping[int, float], thresholds: Mapping[int, float]) -> List[int]:
    """
    Extract the features to keep

    Parameters
    ----------
    feature_proportions: Mapping[int, float]
        Lookup from variable to % of splits in the model that use that variable
    thresholds:  Mapping[int, float]
        Lookup from variable to required % of splits in the model to be kept

    Returns
    -------
    List[int]
        Variables selected for inclusion in the final model, in sorted order
    """
    # is_kept reports its flags in sorted-key order, so pair them with the same ordering
    features = sorted(feature_proportions.keys())
    return [feature for feature, keep in zip(features, is_kept(feature_proportions, thresholds)) if keep]

def is_kept(feature_proportions: Mapping[int, float], thresholds: Mapping[int, float]) -> List[bool]:
    """
    Determine whether each variable should be kept after selection

    Parameters
    ----------
    feature_proportions: Mapping[int, float]
        Lookup from variable to % of splits in the model that use that variable
    thresholds:  Mapping[int, float]
        Lookup from variable to required % of splits in the model to be kept

    Returns
    -------
    List[bool]
        One entry per key of feature_proportions, in sorted key order
        True if the variable's proportion is strictly greater than its threshold, False otherwise

    Raises
    ------
    KeyError
        If a variable in feature_proportions has no entry in thresholds
    """
    return [feature_proportions[feature] > thresholds[feature] for feature in sorted(feature_proportions)]

def partition_into_passed_and_failed_features(feature_proportions, thresholds):
    """
    Split the feature proportions into those that passed selection and those that did not

    Parameters
    ----------
    feature_proportions
        Lookup from variable to proportion of splits that use that variable
    thresholds
        Lookup from variable to the proportion required for the variable to be kept

    Returns
    -------
    A pair of dictionaries (passed, failed), each mapping variable to its proportion
    """
    kept = kept_features(feature_proportions, thresholds)
    passed_features, failed_features = {}, {}
    for feature, proportion in feature_proportions.items():
        bucket = passed_features if feature in kept else failed_features
        bucket[feature] = proportion
    return passed_features, failed_features

def plot_feature_proportions_against_thresholds(feature_proportions, thresholds, ax=None):
    """
    Plot each variable's share of splits against its selection threshold

    Thresholds are drawn as thin black bars; variables that passed are scattered in green
    and variables that failed in red. All values are multiplied by 100 before plotting

    Parameters
    ----------
    feature_proportions
        Lookup from variable to proportion of splits that use that variable
    thresholds
        Lookup from variable to the proportion required for the variable to be kept
    ax
        Matplotlib axes to draw on.
        If None, a new figure with a single set of axes is created

    Returns
    -------
    The axes that were drawn on
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)
    passed_features, failed_features = partition_into_passed_and_failed_features(feature_proportions, thresholds)

    def as_percent(values):
        return [value * 100 for value in values]

    ax.bar(thresholds.keys(), as_percent(thresholds.values()), width=0.01, color="black", alpha=0.5)
    ax.scatter(passed_features.keys(), as_percent(passed_features.values()), c="g")
    ax.scatter(failed_features.keys(), as_percent(failed_features.values()), c="r")
    ax.set_title("Feature Importance Compared to Threshold")
    ax.set_ylabel("% Splits")
    return ax


def feature_split_proportions(model: SklearnModel, columns: Optional[List[int]] = None) ‑> Mapping[int, float]
Expand source code
def feature_split_proportions(model: SklearnModel, columns: Optional[List[int]]=None) -> Mapping[int, float]:
    """
    Calculate the share of tree nodes that split on each variable

    Walks every node of every tree of every sample in ``model.model_samples``
    and tallies ``node.split.splitting_variable``

    Parameters
    ----------
    model: SklearnModel
        Model whose ``model_samples`` are inspected
    columns: Optional[List[int]]
        Variables to report on.
        If None, every non-None splitting variable that was observed is reported, in sorted order

    Returns
    -------
    Mapping[int, float]
        A lookup from variable to its count divided by the total number of nodes visited.
        Nodes whose splitting variable is None are counted in the denominator.
        Requested variables that were never split on map to 0.0
    """
    split_variables = [
        node.split.splitting_variable
        for sample in model.model_samples
        for tree in sample.trees
        for node in tree.nodes
    ]
    counter = Counter(split_variables)
    if columns is None:
        columns = sorted(x for x in counter if x is not None)

    n_nodes = len(split_variables)
    # Membership is tested before dividing, so a model without any nodes never divides by zero
    return {column: counter[column] / n_nodes if column in counter else 0.0 for column in columns}
def global_thresholds(null_distributions: Mapping[int, List[float]], percentile: float) ‑> Mapping[int, float]

Calculate the required proportion of splits to be selected by variable

Creates a distribution of the highest inclusion percentage of any variable in each of the permuted models. The threshold is set as a percentile of this distribution.

All variables have the same threshold

Note that this is significantly more stringent than the local threshold


null_distributions : ImportanceDistributionMap
A mapping from variable to distribution of split inclusion proportions under the null
percentile : float
The percentile of the null distribution to use as a cutoff. It is passed unchanged to np.percentile, which expects a value between 0 and 100. The higher the percentile, the more stringent the threshold.


Mapping[int, float]
A lookup from column to % inclusion threshold
Expand source code
def global_thresholds(null_distributions: ImportanceDistributionMap, percentile: float) -> Mapping[int, float]:
    """
    Calculate the required proportion of splits to be selected by variable

    Creates a distribution of the _highest_ inclusion percentage of any variable in each of the permuted models
    Threshold is set as a percentile of this distribution

    All variables have the same threshold

    Note that this is significantly more stringent than the local threshold

    Parameters
    ----------
    null_distributions: ImportanceDistributionMap
        A mapping from variable to distribution of split inclusion proportions under the null
    percentile: float
        The percentile of the null distribution to use as a cutoff.
        Passed unchanged to np.percentile, which expects a value between 0 and 100.
        The higher the percentile, the more stringent the threshold

    Returns
    -------
    Mapping[int, float]
        A lookup from column to % inclusion threshold.
        Empty if null_distributions is empty
    """
    if not null_distributions:
        # No variables means there is no distribution to take a percentile of
        return {}
    df = pd.DataFrame(null_distributions)
    # Each row holds every variable's proportion for one permuted model, so the row-wise max
    # is the highest inclusion proportion achieved by any variable in that model
    q_s = df.max(axis=1).to_numpy()
    threshold = np.percentile(q_s, percentile)
    return {feature: threshold for feature in null_distributions}
def is_kept(feature_proportions: Mapping[int, float], thresholds: Mapping[int, float]) ‑> List[bool]

Determine whether each variable should be kept after selection


feature_proportions : Mapping[int, float]
Lookup from variable to % of splits in the model that use that variable
thresholds :  Mapping[int, float]
Lookup from variable to required % of splits in the model to be kept


An array of length equal to the width of the covariate matrix. Each entry is True if the variable should be kept, False otherwise.
Expand source code
def is_kept(feature_proportions: Mapping[int, float], thresholds: Mapping[int, float]) -> List[bool]:
    """
    Determine whether each variable should be kept after selection

    Parameters
    ----------
    feature_proportions: Mapping[int, float]
        Lookup from variable to % of splits in the model that use that variable
    thresholds:  Mapping[int, float]
        Lookup from variable to required % of splits in the model to be kept

    Returns
    -------
    List[bool]
        One entry per key of feature_proportions, in sorted key order
        True if the variable's proportion is strictly greater than its threshold, False otherwise

    Raises
    ------
    KeyError
        If a variable in feature_proportions has no entry in thresholds
    """
    return [feature_proportions[feature] > thresholds[feature] for feature in sorted(feature_proportions)]
def kept_features(feature_proportions: Mapping[int, float], thresholds: Mapping[int, float]) ‑> List[int]

Extract the features to keep


feature_proportions : Mapping[int, float]
Lookup from variable to % of splits in the model that use that variable
thresholds :  Mapping[int, float]
Lookup from variable to required % of splits in the model to be kept


Variable selected for inclusion in the final model
Expand source code
def kept_features(feature_proportions: Mapping[int, float], thresholds: Mapping[int, float]) -> List[int]:
    """
    Extract the features to keep

    Parameters
    ----------
    feature_proportions: Mapping[int, float]
        Lookup from variable to % of splits in the model that use that variable
    thresholds:  Mapping[int, float]
        Lookup from variable to required % of splits in the model to be kept

    Returns
    -------
    List[int]
        Variables selected for inclusion in the final model, in sorted order
    """
    # is_kept reports its flags in sorted-key order, so pair them with the same ordering
    features = sorted(feature_proportions.keys())
    return [feature for feature, keep in zip(features, is_kept(feature_proportions, thresholds)) if keep]
def local_thresholds(null_distributions: Mapping[int, List[float]], percentile: float) ‑> Mapping[int, float]

Calculate the required proportion of splits to be selected by variable

Creates a null distribution for each variable based on the % of splits including that variable in each of the permuted models

Each variable has its own threshold that is independent of the other variables

Note - this is significantly less stringent than the global threshold


null_distributions : ImportanceDistributionMap
A mapping from variable to distribution of split inclusion proportions under the null
percentile : float
The percentile of the null distribution to use as a cutoff. It is passed unchanged to np.percentile, which expects a value between 0 and 100. The higher the percentile, the more stringent the threshold.


Mapping[int, float]
A lookup from column to % inclusion threshold
Expand source code
def local_thresholds(null_distributions: ImportanceDistributionMap, percentile: float) -> Mapping[int, float]:
    """
    Calculate the required proportion of splits to be selected by variable

    Creates a null distribution for each variable based on the % of splits including that variable in each of the permuted models

    Each variable has its own threshold that is independent of the other variables

    Note - this is significantly less stringent than the global threshold

    Parameters
    ----------
    null_distributions: ImportanceDistributionMap
        A mapping from variable to distribution of split inclusion proportions under the null
    percentile: float
        The percentile of the null distribution to use as a cutoff.
        Passed unchanged to np.percentile, which expects a value between 0 and 100.
        The higher the percentile, the more stringent the threshold

    Returns
    -------
    Mapping[int, float]
        A lookup from column to % inclusion threshold
    """
    return {
        feature: np.percentile(distribution, percentile)
        for feature, distribution in null_distributions.items()
    }
def null_feature_split_proportions_distribution(model: SklearnModel, X: Union[numpy.ndarray, pandas.core.frame.DataFrame], y: numpy.ndarray, n_permutations: int = 10) ‑> Mapping[int, List[float]]

Calculate a null distribution of proportion of splits on each variable in X

Works by randomly permuting y to remove any true dependence of y on X and calculating feature importance


model : SklearnModel
Model specification to work with
X : np.ndarray
Covariate matrix
y : np.ndarray
Target data
n_permutations : int
How many permutations to run. The higher the number of permutations, the more accurate the null distribution, but the longer it will take to run.


Mapping[int, List[float]]
A list of inclusion proportions for each variable in X
Expand source code
def null_feature_split_proportions_distribution(model: SklearnModel,
                                                X: Union[pd.DataFrame, np.ndarray],
                                                y: np.ndarray,
                                                n_permutations: int=10) -> Mapping[int, List[float]]:
    """
    Calculate a null distribution of proportion of splits on each variable in X

    Works by randomly permuting y to remove any true dependence of y on X and calculating feature importance

    Parameters
    ----------
    model: SklearnModel
        Model specification to work with
    X: np.ndarray
        Covariate matrix
    y: np.ndarray
        Target data
    n_permutations: int
        How many permutations to run
        The higher the number of permutations, the more accurate the null distribution, but the longer it will take to run

    Returns
    -------
    Mapping[int, List[float]]
        A list of inclusion proportions for each variable in X,
        with one entry appended per model returned by run_models
    """
    n_columns = X.shape[1]
    inclusion_dict = {x: [] for x in range(n_columns)}

    y_s = [np.random.permutation(y) for _ in range(n_permutations)]
    X_s = [X for _ in y_s]

    # NOTE(review): run_models presumably fits one model per (X, y) pair - confirm against ..runner
    fit_models = run_models(model, X_s, y_s)

    # Ask for every column explicitly so variables that were never split on still record 0.0
    columns = list(range(n_columns))
    for fit_model in fit_models:
        splits_run = feature_split_proportions(fit_model, columns)
        for key, value in splits_run.items():
            inclusion_dict[key].append(value)

    return inclusion_dict
def partition_into_passed_and_failed_features(feature_proportions, thresholds)
Expand source code
def partition_into_passed_and_failed_features(feature_proportions, thresholds):
    """
    Split the feature proportions into those that passed selection and those that did not

    Parameters
    ----------
    feature_proportions
        Lookup from variable to proportion of splits that use that variable
    thresholds
        Lookup from variable to the proportion required for the variable to be kept

    Returns
    -------
    A pair of dictionaries (passed, failed), each mapping variable to its proportion
    """
    kept = kept_features(feature_proportions, thresholds)
    passed_features, failed_features = {}, {}
    for feature, proportion in feature_proportions.items():
        bucket = passed_features if feature in kept else failed_features
        bucket[feature] = proportion
    return passed_features, failed_features
def plot_feature_proportions_against_thresholds(feature_proportions, thresholds, ax=None)
Expand source code
def plot_feature_proportions_against_thresholds(feature_proportions, thresholds, ax=None):
    """
    Plot each variable's share of splits against its selection threshold

    Thresholds are drawn as thin black bars; variables that passed are scattered in green
    and variables that failed in red. All values are multiplied by 100 before plotting

    Parameters
    ----------
    feature_proportions
        Lookup from variable to proportion of splits that use that variable
    thresholds
        Lookup from variable to the proportion required for the variable to be kept
    ax
        Matplotlib axes to draw on.
        If None, a new figure with a single set of axes is created

    Returns
    -------
    The axes that were drawn on
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)
    passed_features, failed_features = partition_into_passed_and_failed_features(feature_proportions, thresholds)

    def as_percent(values):
        return [value * 100 for value in values]

    ax.bar(thresholds.keys(), as_percent(thresholds.values()), width=0.01, color="black", alpha=0.5)
    ax.scatter(passed_features.keys(), as_percent(passed_features.values()), c="g")
    ax.scatter(failed_features.keys(), as_percent(failed_features.values()), c="r")
    ax.set_title("Feature Importance Compared to Threshold")
    ax.set_ylabel("% Splits")
    return ax
def plot_feature_split_proportions(model: SklearnModel, ax=None)
Expand source code
def plot_feature_split_proportions(model: SklearnModel, ax=None):
    """
    Draw a horizontal bar chart of the proportion of splits made on each variable

    Parameters
    ----------
    model: SklearnModel
        Model passed to feature_split_proportions to obtain the proportions
    ax
        Matplotlib axes to draw on.
        If None, a new figure with a single set of axes is created

    Returns
    -------
    The axes that were drawn on
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)
    proportions = feature_split_proportions(model)

    props = pd.DataFrame({
        "name": list(proportions.keys()),
        "counts": list(proportions.values()),
    }).sort_values("name", ascending=True)
    y_pos = np.arange(len(props))
    # Draw on the supplied axes rather than pyplot's current axes, so a caller-provided `ax` is honoured
    ax.barh(y_pos, props["counts"], align='center', alpha=0.5)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(props["name"])
    ax.set_xlabel('Proportion of all splits')
    ax.set_title('Proportion of Splits Made on Each Variable')
    return ax
def plot_null_feature_importance_distributions(null_distributions: Mapping[int, List[float]], ax=None) ‑> None
Expand source code
def plot_null_feature_importance_distributions(null_distributions: Mapping[int, List[float]], ax=None):
    """
    Draw one box plot per variable of its split proportions under the null

    Parameters
    ----------
    null_distributions: Mapping[int, List[float]]
        A mapping from variable to distribution of split inclusion proportions under the null
    ax
        Matplotlib axes to draw on.
        If None, a new figure with a single set of axes is created

    Returns
    -------
    The axes that were drawn on
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)
    df = pd.DataFrame(null_distributions)
    # Reshape from one column per variable to long form: one (variable, p) row per observation
    df = pd.DataFrame(df.unstack()).reset_index().drop("level_1", axis=1)
    df.columns = ["variable", "p"]
    sns.boxplot(x="variable", y="p", data=df, ax=ax)
    ax.set_title("Null Feature Importance Distribution")
    return ax