Expand source code
from copy import deepcopy
from typing import List, Tuple, Union

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold

from ..runner import run_models
from ..sklearnmodel import SklearnModel


def original_model_rmse(model: SklearnModel,
                        X: Union[pd.DataFrame, np.ndarray],
                        y: np.ndarray,
                        n_k_fold_splits: int) -> List[float]:
    """
    Score the unpermuted model with shuffled K-fold cross validation

    The resulting RMSEs are the benchmark that the null distribution is compared against

    Parameters
    ----------
    model: SklearnModel
        Model that is copied, fit and scored once per fold
    X: np.ndarray
        Covariate matrix, rows are selected with the fold indices
    y: np.ndarray
        Target array, indexed with the same fold indices as X
    n_k_fold_splits: int
        Number of folds handed to KFold

    Returns
    -------
    List[float]
        One out of sample RMSE per fold, in the order the folds are produced
    """
    splitter = KFold(n_k_fold_splits, shuffle=True)
    fold_rmses = []
    current = model

    for fit_rows, held_out_rows in splitter.split(X):
        # Each fold fits a deep copy of the model used by the previous fold,
        # so the instance passed in by the caller is never fitted itself
        current = deepcopy(current)
        current.fit(X[fit_rows], y[fit_rows])
        fold_score = current.rmse(X[held_out_rows], y[held_out_rows])
        fold_rmses.append(fold_score)

    return fold_rmses


def null_rmse_distribution(model: SklearnModel,
                           X: Union[pd.DataFrame, np.ndarray],
                           y: np.ndarray,
                           variable: int,
                           n_k_fold_splits: int,
                           n_permutations: int=10) -> List[float]:
    """
    Calculate a null distribution on the RMSEs after scrambling a variable

    Works by randomly permuting one column of X, which removes any true dependence of y on that variable,
    then fitting the model on the training rows of each fold and scoring it on the held out rows

    RMSEs are calculated on out of sample data

    Parameters
    ----------
    model: SklearnModel
        Model specification to work with
    X: np.ndarray
        Covariate matrix
    y: np.ndarray
        Target data
    variable: int
        Which column of the covariate matrix to permute
    n_k_fold_splits: int
        How many K-fold splits to make of the data
    n_permutations: int
        How many permutations to run
        The higher the number of permutations, the more accurate the null distribution, but the longer it will take to run

    Returns
    -------
    List[float]
        A list of predict set RMSEs - one entry for each fold of each permutation
    """

    kf = KFold(n_k_fold_splits, shuffle=True)

    permuted_train_X_s = []
    permuted_test_X_s = []
    train_y_s = []
    test_y_s = []

    for train_index, test_index in kf.split(X):
        for _ in range(n_permutations):
            # Permute a copy so the caller's covariate matrix is left untouched
            permuted_X = deepcopy(X)
            permuted_X[:, variable] = np.random.permutation(permuted_X[:, variable])
            permuted_train_X_s.append(permuted_X[train_index])
            # Score on the held out rows so they line up with test_y_s and the RMSE is truly out of sample
            permuted_test_X_s.append(permuted_X[test_index])
            train_y_s.append(y[train_index])
            test_y_s.append(y[test_index])

    # NOTE(review): assumes run_models returns one fitted model per training set, in the same order - confirm
    fit_models = run_models(model, permuted_train_X_s, train_y_s)

    return [
        fit_model.rmse(test_X, test_y)
        for fit_model, test_X, test_y in zip(fit_models, permuted_test_X_s, test_y_s)
    ]


def feature_importance(model: SklearnModel,
                       X: Union[pd.DataFrame, np.ndarray],
                       y: np.ndarray,
                       variable: int,
                       n_k_fold_splits: int=2,
                       n_permutations: int=10) -> Tuple[List[float], List[float]]:
    """
    Assess the importance to the RMSE of a single column of the covariate matrix

    As a side effect, histograms of both sets of RMSEs are drawn through pyplot

    Parameters
    ----------
    model: SklearnModel
        An instance of the model with the parameters to train with
        The model instance itself doesn't have to be trained
    X: np.ndarray
        Covariate matrix
    y: np.ndarray
        Target array
    variable: int
        Which column of the covariate matrix to assess
    n_k_fold_splits: int
        How many folds to take of the covariate matrix
    n_permutations: int
        How many runs of the model to make when generating the null distribution
        The more permutations, the better the approximation to the true null, but the more computation will be required

    Returns
    -------
    Tuple[List[float], List[float]]
        First entry is a List of the RMSEs of the original model
        Second entry is a list of RMSEs of the null distribution
    """
    original_model = original_model_rmse(model, X, y, n_k_fold_splits)
    null_distribution = null_rmse_distribution(model, X, y, variable, n_k_fold_splits, n_permutations)

    # NOTE(review): the y axis is labelled "Density" but hist is called without density=True - confirm intent
    plt.hist(null_distribution, label="Null Distribution")
    plt.hist(original_model, label="Original Model")
    plt.title("RMSE of full model against null distribution for variable {}".format(variable))
    plt.xlabel("RMSE")
    plt.ylabel("Density")
    # Without a legend the labels above are never shown and the two histograms cannot be told apart
    plt.legend()

    return original_model, null_distribution

Functions

def feature_importance(model: SklearnModel, X: Union[numpy.ndarray, pandas.core.frame.DataFrame], y: numpy.ndarray, variable: int, n_k_fold_splits: int = 2, n_permutations: int = 10) ‑> Tuple[List[float], List[float]]

Assess the importance to the RMSE of a single column of the covariate matrix

Parameters

model : SklearnModel
An instance of the model with the parameters to train with. The model instance itself doesn't have to be trained.
X : np.ndarray
Covariate matrix
y : np.ndarray
Target array
variable : int
Which column of the covariate matrix to assess
n_k_fold_splits : int
How many folds to take of the covariate matrix
n_permutations : int
How many runs of the model to make when generating the null distribution. The more permutations, the better the approximation to the true null, but the more computation will be required.

Returns

Tuple[List[float], List[float]]
First entry is a list of the RMSEs of the original model. Second entry is a list of the RMSEs of the null distribution.
Expand source code
def feature_importance(model: SklearnModel,
                       X: Union[pd.DataFrame, np.ndarray],
                       y: np.ndarray,
                       variable: int,
                       n_k_fold_splits: int=2,
                       n_permutations: int=10) -> Tuple[List[float], List[float]]:
    """
    Assess the importance to the RMSE of a single column of the covariate matrix

    As a side effect, histograms of both sets of RMSEs are drawn through pyplot

    Parameters
    ----------
    model: SklearnModel
        An instance of the model with the parameters to train with
        The model instance itself doesn't have to be trained
    X: np.ndarray
        Covariate matrix
    y: np.ndarray
        Target array
    variable: int
        Which column of the covariate matrix to assess
    n_k_fold_splits: int
        How many folds to take of the covariate matrix
    n_permutations: int
        How many runs of the model to make when generating the null distribution
        The more permutations, the better the approximation to the true null, but the more computation will be required

    Returns
    -------
    Tuple[List[float], List[float]]
        First entry is a List of the RMSEs of the original model
        Second entry is a list of RMSEs of the null distribution
    """
    original_model = original_model_rmse(model, X, y, n_k_fold_splits)
    null_distribution = null_rmse_distribution(model, X, y, variable, n_k_fold_splits, n_permutations)

    # NOTE(review): the y axis is labelled "Density" but hist is called without density=True - confirm intent
    plt.hist(null_distribution, label="Null Distribution")
    plt.hist(original_model, label="Original Model")
    plt.title("RMSE of full model against null distribution for variable {}".format(variable))
    plt.xlabel("RMSE")
    plt.ylabel("Density")
    # Without a legend the labels above are never shown and the two histograms cannot be told apart
    plt.legend()

    return original_model, null_distribution
def null_rmse_distribution(model: SklearnModel, X: Union[numpy.ndarray, pandas.core.frame.DataFrame], y: numpy.ndarray, variable: int, n_k_fold_splits: int, n_permutations: int = 10) ‑> List[float]

Calculate a null distribution on the RMSEs after scrambling a variable

Works by randomly permuting one column of X, which removes any true dependence of y on that variable, then refitting the model and calculating its RMSE

RMSEs are calculated on out of sample data

Parameters

model : SklearnModel
Model specification to work with
X : np.ndarray
Covariate matrix
y : np.ndarray
Target data
variable : int
Which column of the covariate matrix to permute
n_k_fold_splits : int
How many K-fold splits to make of the data
n_permutations : int
How many permutations to run. The higher the number of permutations, the more accurate the null distribution, but the longer it will take to run.

Returns

List[float]
A list of predict set RMSEs - one entry for each fold of each permutation
Expand source code
def null_rmse_distribution(model: SklearnModel,
                           X: Union[pd.DataFrame, np.ndarray],
                           y: np.ndarray,
                           variable: int,
                           n_k_fold_splits: int,
                           n_permutations: int=10) -> List[float]:
    """
    Calculate a null distribution on the RMSEs after scrambling a variable

    Works by randomly permuting one column of X, which removes any true dependence of y on that variable,
    then fitting the model on the training rows of each fold and scoring it on the held out rows

    RMSEs are calculated on out of sample data

    Parameters
    ----------
    model: SklearnModel
        Model specification to work with
    X: np.ndarray
        Covariate matrix
    y: np.ndarray
        Target data
    variable: int
        Which column of the covariate matrix to permute
    n_k_fold_splits: int
        How many K-fold splits to make of the data
    n_permutations: int
        How many permutations to run
        The higher the number of permutations, the more accurate the null distribution, but the longer it will take to run

    Returns
    -------
    List[float]
        A list of predict set RMSEs - one entry for each fold of each permutation
    """

    kf = KFold(n_k_fold_splits, shuffle=True)

    permuted_train_X_s = []
    permuted_test_X_s = []
    train_y_s = []
    test_y_s = []

    for train_index, test_index in kf.split(X):
        for _ in range(n_permutations):
            # Permute a copy so the caller's covariate matrix is left untouched
            permuted_X = deepcopy(X)
            permuted_X[:, variable] = np.random.permutation(permuted_X[:, variable])
            permuted_train_X_s.append(permuted_X[train_index])
            # Score on the held out rows so they line up with test_y_s and the RMSE is truly out of sample
            permuted_test_X_s.append(permuted_X[test_index])
            train_y_s.append(y[train_index])
            test_y_s.append(y[test_index])

    # NOTE(review): assumes run_models returns one fitted model per training set, in the same order - confirm
    fit_models = run_models(model, permuted_train_X_s, train_y_s)

    return [
        fit_model.rmse(test_X, test_y)
        for fit_model, test_X, test_y in zip(fit_models, permuted_test_X_s, test_y_s)
    ]
def original_model_rmse(model: SklearnModel, X: Union[numpy.ndarray, pandas.core.frame.DataFrame], y: numpy.ndarray, n_k_fold_splits: int) ‑> List[float]

Calculate the RMSE of the original model. Used as a benchmark to compare against the null.

Parameters

model : SklearnModel
 
X : np.ndarray
 
y : np.ndarray
 
n_k_fold_splits : int
 

Returns

List[float]
List of the out of sample RMSEs for each fold of the covariate matrix
Expand source code
def original_model_rmse(model: SklearnModel,
                        X: Union[pd.DataFrame, np.ndarray],
                        y: np.ndarray,
                        n_k_fold_splits: int) -> List[float]:
    """
    Score the unpermuted model with shuffled K-fold cross validation

    The resulting RMSEs are the benchmark that the null distribution is compared against

    Parameters
    ----------
    model: SklearnModel
        Model that is copied, fit and scored once per fold
    X: np.ndarray
        Covariate matrix, rows are selected with the fold indices
    y: np.ndarray
        Target array, indexed with the same fold indices as X
    n_k_fold_splits: int
        Number of folds handed to KFold

    Returns
    -------
    List[float]
        One out of sample RMSE per fold, in the order the folds are produced
    """
    splitter = KFold(n_k_fold_splits, shuffle=True)
    fold_rmses = []
    current = model

    for fit_rows, held_out_rows in splitter.split(X):
        # Each fold fits a deep copy of the model used by the previous fold,
        # so the instance passed in by the caller is never fitted itself
        current = deepcopy(current)
        current.fit(X[fit_rows], y[fit_rows])
        fold_score = current.rmse(X[held_out_rows], y[held_out_rows])
        fold_rmses.append(fold_score)

    return fold_rmses