Expand source code
import itertools
from typing import Set, Tuple
import numpy as np
import pandas as pd
def make_rj(n=300, p=50):
    """Simulate data from the additive-plus-interaction model of Radchenko & James, 2010.

    X_i ~ Unif([0,1]^p);
    y = sum_{i=1}^5 f_i(x) + f_1(x)f_2(x) + f_1(x)f_3(x) + N(0,1), where
    f_1(x) = x1, f_2(x) = (1+x2)^{-1}, f_3(x) = sin(x3), f_4(x) = e^x4, f_5(x) = x5^2.
    Each main-effect component is standardized (zero mean, unit variance) before
    summation; the interaction terms use the raw (unstandardized) components.

    NOTE(review): the paper's sqrt(0.5) scaling of the signal is not applied
    here — confirm whether that is intentional.

    Params
    ------
    n (int): number of samples
    p (int): number of features

    Returns
    -------
    Tuple[np.array, np.array]: design matrix and label vector
    """
    X = np.random.uniform(0, 1, size=(n, p))

    # The five main-effect components f_1..f_5, in order.
    components = [
        X[:, 0],
        (1 + X[:, 1]) ** (-1),
        np.sin(X[:, 2]),
        np.exp(X[:, 3]),
        X[:, 4] ** (2),
    ]

    def _standardize(v):
        # Zero-mean, unit-variance rescaling of a component vector.
        return (v - np.mean(v)) / np.std(v)

    effects = sum(_standardize(c) for c in components)
    interactions = components[0] * components[1] + components[0] * components[2]
    y = effects + interactions + np.random.normal(size=n)
    return X, y
def make_vp(n=100, p=100):
    """Simulate data following https://arxiv.org/abs/1607.02670
    (Sparse additive Gaussian process with soft interactions).

    X_i ~ N(0, I);
    y = x1 + x2^2 + x3 + x4^2 + x5 + x1x2 + x2x3 + x3x4 + N(0, 0.14)

    Args:
        n (int): number of samples
        p (int): number of features

    Returns:
        Tuple[np.array, np.array]: design matrix and label vector
    """
    X = np.random.normal(size=(n, p))
    x1, x2, x3, x4, x5 = (X[:, j] for j in range(5))
    main_effects = x1 + x2 ** 2 + x3 + x4 ** 2 + x5
    pairwise = x1 * x2 + x2 * x3 + x3 * x4
    noise = np.random.normal(scale=0.14, size=n)
    return X, main_effects + pairwise + noise
def get_gt(dataset_name):
    """Return the ground-truth important features and interacting pairs for a dataset.

    Args:
        dataset_name (str): one of "friedman1", "radchenko_james", "vo_pati";
            any other name yields two empty sets.

    Returns:
        Tuple[Set[int], Set[Tuple[int, int]]]: important feature indices and
        interacting feature-index pairs.
    """
    ground_truth = {
        "friedman1": ([0, 1, 2, 3, 4], [(0, 1)]),
        "radchenko_james": ([0, 1, 2, 3, 4], [(0, 1), (0, 2)]),
        "vo_pati": ([0, 1, 2, 3, 4], [(0, 1), (1, 2), (2, 3)]),
    }
    features, pairs = ground_truth.get(dataset_name, ([], []))
    return set(features), set(pairs)
def get_important_features(importance, k):
    """Return the indices of the k features with the HIGHEST importance scores.

    Fix: the original took ``np.argsort(importance)[0:k]``, i.e. the k LEAST
    important features, which is inconsistent with ``get_interacting_features``
    (that one ranks pairs by descending score).

    Args:
        importance: 1-D array-like of per-feature importance scores
            (higher = more important).
        k (int): number of top features to return.

    Returns:
        Set[int]: indices of the top-k features.
    """
    order = np.argsort(importance)[::-1]  # descending by importance
    return set(order[:k].tolist())
def get_interacting_features(interaction, k):
    """Return the k feature pairs with the highest interaction scores.

    Args:
        interaction: square (p, p) matrix of pairwise interaction scores;
            only the upper triangle (i < j) is read.
        k (int): number of top-scoring pairs to return.

    Returns:
        Set[Tuple[int, int]]: the k pairs (i, j), i < j, with the largest scores.
    """
    p = interaction.shape[0]
    scored_pairs = [
        (interaction[i, j], i, j)
        for i, j in itertools.combinations(range(p), 2)
    ]
    scored_pairs.sort(key=lambda triple: triple[0], reverse=True)
    return {(i, j) for _, i, j in scored_pairs[:k]}
def interaction_fpr(i_gt: Set[Tuple], i_hat: Set[Tuple], p: int):
    """False-positive rate of predicted interactions.

    Args:
        i_gt: ground-truth interacting pairs.
        i_hat: predicted interacting pairs.
        p: total number of features (defines the universe of p*(p-1)/2 pairs).

    Returns:
        float: |i_hat \\ i_gt| / (#pairs - |i_gt|), or None when i_gt is empty
        (the rate is undefined without ground truth).
    """
    if not i_gt:
        return None
    total_pairs = 0.5 * p * (p - 1)
    negatives = total_pairs - len(i_gt)
    return len(i_hat - i_gt) / negatives
def interaction_tpr(i_gt: Set[Tuple], i_hat: Set[Tuple], p: int):
    """True-positive rate (recall) of predicted interactions.

    Args:
        i_gt: ground-truth interacting pairs.
        i_hat: predicted interacting pairs.
        p: total number of features (unused; kept for a uniform metric signature).

    Returns:
        float: |i_gt ∩ i_hat| / |i_gt|, or None when i_gt is empty.
    """
    if not i_gt:
        return None
    return len(i_gt & i_hat) / len(i_gt)
def interaction_f1(i_gt: Set[Tuple], i_hat: Set[Tuple], p: int):
    """F1 score of predicted interactions.

    Fix: the original derived precision via ``interaction_tpr(i_hat, i_gt, p)``,
    which returns None when ``i_hat`` is empty, so ``recall + precision`` raised
    ``TypeError``. Precision is now computed directly with an empty-set guard
    (empty predictions -> precision 0 -> F1 0). Results for non-empty ``i_hat``
    are unchanged.

    Args:
        i_gt: ground-truth interacting pairs.
        i_hat: predicted interacting pairs.
        p: total number of features (unused; kept for a uniform metric signature).

    Returns:
        float: harmonic mean of precision and recall, 0 when both are 0,
        or None when i_gt is empty (undefined without ground truth).
    """
    if len(i_gt) == 0:
        return None
    true_positives = len(i_gt.intersection(i_hat))
    recall = true_positives / len(i_gt)
    precision = true_positives / len(i_hat) if i_hat else 0.0
    if recall + precision == 0:
        return 0
    return 2 * ((precision * recall) / (precision + recall))
Functions
def get_gt(dataset_name)
-
Expand source code
def get_gt(dataset_name): important_features = [] interactions = [] if dataset_name == "friedman1": important_features = [0, 1, 2, 3, 4] interactions = [(0, 1)] elif dataset_name == "radchenko_james": important_features = [0, 1, 2, 3, 4] interactions = [(0, 1), (0, 2)] elif dataset_name == "vo_pati": important_features = [0, 1, 2, 3, 4] interactions = [(0, 1), (1, 2), (2, 3)] return set(important_features), set(interactions)
def get_important_features(importance, k)
-
Expand source code
def get_important_features(importance, k): return set(np.argsort(importance)[0:k])
def get_interacting_features(interaction, k)
-
Expand source code
def get_interacting_features(interaction, k): scores_list = [] for ind_1, ind_2 in itertools.combinations(range(interaction.shape[0]), 2): scores_list.append([interaction[ind_1, ind_2], ind_1, ind_2]) df = pd.DataFrame(scores_list) df = df.sort_values(0, ascending=False) interactions = [] for i in range(k): interactions.append((df.iloc[i, 1], df.iloc[i, 2])) return set(interactions)
def interaction_f1(i_gt: Set[Tuple], i_hat: Set[Tuple], p: int)
-
Expand source code
def interaction_f1(i_gt: Set[Tuple], i_hat: Set[Tuple], p: int): if len(i_gt) == 0: return recall = len(i_gt.intersection(i_hat)) / len(i_gt) precision = interaction_tpr(i_hat, i_gt, p) if recall + precision == 0: return 0 return 2 * ((precision * recall) / (precision + recall))
def interaction_fpr(i_gt: Set[Tuple], i_hat: Set[Tuple], p: int)
-
Expand source code
def interaction_fpr(i_gt: Set[Tuple], i_hat: Set[Tuple], p: int): if len(i_gt) == 0: return n_pairs = 0.5 * p * (p - 1) n_non_interacting_pairs = (n_pairs - len(i_gt)) return len(i_hat.difference(i_gt)) / n_non_interacting_pairs
def interaction_tpr(i_gt: Set[Tuple], i_hat: Set[Tuple], p: int)
-
Expand source code
def interaction_tpr(i_gt: Set[Tuple], i_hat: Set[Tuple], p: int): if len(i_gt) == 0: return n_interactions = len(i_gt) return len(i_hat.intersection(i_gt)) / n_interactions
def make_rj(n=300, p=50)
-
Generates data according to the model in Radchenko & James, 2010 X_i ~ Unif([0,1]^p) y = sqrt(0.5)[sum_{i=1}^5 f_i(x) + f_1(x)f_2(x) + f_1(x)f_3(x)] + N(0,1) f_1(x) = x1, f_2(x) = (1+x2)^{-1}, f_3(x) = sin(x3), f_4(x) = e^x4, f_5(x) = x5^2 functions within the sum are normalized
Params
n (int): number of sample p (int): number of features
Returns
Tuple[np.array, np.array]: design matrix and label vector
Expand source code
def make_rj(n=300, p=50): """Generates data according to the model in Radchenko & James, 2010 X_i ~ Unif([0,1]^p) y = sqrt(0.5)[sum_{i=1}^5 f_i(x) + f_1(x)f_2(x) + f_1(x)f_3(x)] + N(0,1) f_1(x) = x1, f_2(x) = (1+x2)^{-1}, f_3(x) = sin(x3), f_4(x) = e^x4, f_5(x) = x5^2 function withing the sum are normalized Params ------ n (int): number of sample p (int): number of features Returns ------- Tuple[np.array, np.array]: design matrix and label vector """ X = np.random.uniform(0, 1, size=(n, p)) f_1 = X[:, 0] f_2 = (1 + X[:, 1]) ** (-1) f_3 = np.sin(X[:, 2]) f_4 = np.exp(X[:, 3]) f_5 = X[:, 4] ** (2) def _normalize_vec(v): return (v - np.mean(v)) / np.std(v) effects = _normalize_vec(f_1) + _normalize_vec(f_2) + _normalize_vec(f_3) + _normalize_vec(f_4) + _normalize_vec( f_5) interactions = f_1 * f_2 + f_1 * f_3 y = effects + interactions + np.random.normal(size=n) return X, y
def make_vp(n=100, p=100)
-
Generates data according to https://arxiv.org/abs/1607.02670 (Sparse additive Gaussian process with soft interactions) X_i ~ N(0, I) y = x1 + x2^2 + x3 + x4^2 + x5 + x1x2 + x2x3 + x3x4 + N(0, 0.14)
Args
n
:int
- number of sample
p
:int
- number of features
Returns
Tuple[np.array, np.array]
- design matrix and label vector
Expand source code
def make_vp(n=100, p=100): """Generates data according to https://arxiv.org/abs/1607.02670 (Sparse additive Gaussian process with soft interactions) X_i ~ N(0, I) y = x1 + x2^2 + x3 + x4^2 + x5 + x1x2 + x2x3 + x3x4 + N(0, 0.14) Args: n (int): number of sample p (int): number of features Returns: Tuple[np.array, np.array]: design matrix and label vector """ X = np.random.normal(size=(n, p)) effects = X[:, 0] + X[:, 1] ** 2 + X[:, 2] + X[:, 3] ** 2 + X[:, 4] interactions = X[:, 0] * X[:, 1] + X[:, 1] * X[:, 2] + X[:, 2] * X[:, 3] y = effects + interactions + np.random.normal(scale=0.14, size=n) return X, y