Module imodelsx.augtree.utils

Source code
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.metrics import mean_squared_error
ROOT = 0
LEFT = 1
RIGHT = 2
NEG = 0
POS = 1


def clean_str(s):
    """Lowercase, replace '/' with '___', and strip surrounding whitespace
    """
    return s.lower().replace('/', '___').strip()


def impurity_mse(y):
    """Lower impurity (closer to 0) is better
    """
    y_mean = np.mean(y)
    return np.mean((y - y_mean)**2)


def impurity_gini(y):
    """Lower impurity (closer to 0) is better
    """
    return 1 - gini_binary(np.mean(y))


def impurity_entropy(y):
    """Lower impurity (closer to 0) is better
    """
    # entropy is itself an impurity: 0 for a pure node, 1 at a 50/50 split
    return entropy_binary(np.mean(y))


def mse_score(y_true, y_pred):
    """Higher is better (negated MSE, so 0 is best)
    """
    return -mean_squared_error(y_true, y_pred)


def gini_score(y_true, y_pred):
    """Higher is better: gini purity of y_true restricted to the predicted-positive samples
    """
    y_pred = y_pred.astype(bool)
    y_pred_sum = y_pred.sum()
    # if the prediction selects nothing (or everything), fall back to the overall label mean
    if y_pred_sum == 0 or y_pred_sum == y_pred.size:
        y_mean = y_true.mean()
    else:
        y_mean = y_true[y_pred].mean()
    return gini_binary(y_mean)


def entropy_score(y_true, y_pred):
    """Lower is better: binary entropy of y_true restricted to the predicted-positive samples
    """
    y_pred = y_pred.astype(bool)
    y_pred_sum = y_pred.sum()
    # if the prediction selects nothing (or everything), fall back to the overall label mean
    if y_pred_sum == 0 or y_pred_sum == y_pred.size:
        y_mean = y_true.mean()
    else:
        y_mean = y_true[y_pred].mean()
    return entropy_binary(y_mean)


def gini_binary(y_mean: float) -> float:
    """Higher is better (higher = purer)
    y_mean of 0 or 1 -> 1 (pure); y_mean of 0.5 -> 0.5 (maximally impure)
    """
    return y_mean**2 + (1 - y_mean)**2


def entropy_binary(y_mean: float) -> float:
    """Lower is better (lower = purer): 0 for a pure node, 1 at y_mean = 0.5
    """
    # guard the endpoints, where log2(0) would otherwise produce nan
    if y_mean <= 0 or y_mean >= 1:
        return 0.0
    return -y_mean * np.log2(y_mean) - (1 - y_mean) * np.log2(1 - y_mean)


def get_gini_impurity_reduction_from_sklearn_stump(m: DecisionTreeClassifier):
    """Calculate the gini impurity reduction achieved by the first split of model m
    """
    gini_orig = m.tree_.impurity[ROOT]
    gini_left = m.tree_.impurity[LEFT]
    gini_right = m.tree_.impurity[RIGHT]
    frac_samples_left = m.tree_.n_node_samples[LEFT] / m.tree_.n_node_samples[ROOT]
    frac_samples_right = m.tree_.n_node_samples[RIGHT] / m.tree_.n_node_samples[ROOT]
    # parent impurity minus the sample-weighted impurity of the two children
    return gini_orig - (frac_samples_left * gini_left + frac_samples_right * gini_right)


def check_if_feature_contributes_positively_from_sklearn_stump(m: DecisionTreeClassifier):
    """Check whether the feature being positive makes the predicted value increase or decrease
    """
    # compare the positive-class fraction of the right child (feature present)
    # against that of the left child (feature absent) at the first split
    value = m.tree_.value
    frac_pos_left = value[LEFT][0, POS] / value[LEFT][0].sum()
    frac_pos_right = value[RIGHT][0, POS] / value[RIGHT][0].sum()
    return frac_pos_right > frac_pos_left


def get_spacy_tokenizer(convert_lower=True, use_stemming=False):
    """Return an LLMTreeTokenizer built on spaCy's English tokenizer
    """
    return LLMTreeTokenizer(convert_lower, use_stemming)


class LLMTreeTokenizer:
    def __init__(self, convert_lower, use_stemming):
        from spacy.lang.en import English
        self.tok = English()
        self.convert_lower = convert_lower
        self.use_stemming = use_stemming
        if self.use_stemming:
            from nltk.stem.porter import PorterStemmer
            self.stemmer = PorterStemmer()

    def __call__(self, s):
        if self.convert_lower:
            s = s.lower()
        strs = [str(x) for x in self.tok(s)]
        if self.use_stemming:
            strs = [self.stemmer.stem(x) for x in strs]
        return strs

Functions

def check_if_feature_contributes_positively_from_sklearn_stump(m: sklearn.tree._classes.DecisionTreeClassifier)

Check whether the feature being positive makes the predicted value increase or decrease (based on the first split of m).

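A quick sketch of expected behavior, assuming a depth-1 stump fit on a perfectly separating binary feature:

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from imodelsx.augtree.utils import check_if_feature_contributes_positively_from_sklearn_stump

X = np.array([[0], [0], [1], [1]])
y = np.array([0, 0, 1, 1])
stump = DecisionTreeClassifier(max_depth=1).fit(X, y)
print(check_if_feature_contributes_positively_from_sklearn_stump(stump))  # True: the right child is all-positive
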
def clean_str(s)

Lowercase, replace '/' with '___', and strip surrounding whitespace.
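
For example:

from imodelsx.augtree.utils import clean_str

print(clean_str('  Movie/Review '))  # 'movie___review'
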
def entropy_binary(y_mean: float) -> float

Lower is better (lower = purer): 0 for a pure node, 1 at y_mean = 0.5.

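Illustrative values (assuming the endpoint guard added above):

from imodelsx.augtree.utils import entropy_binary

print(entropy_binary(0.5))  # 1.0 (maximally impure)
print(entropy_binary(1.0))  # 0.0 (pure)
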
def entropy_score(y_true, y_pred)

Lower is better: the binary entropy of y_true restricted to the predicted-positive samples.

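An illustrative sketch:

import numpy as np
from imodelsx.augtree.utils import entropy_score

y_true = np.array([0, 1, 1, 0])
print(entropy_score(y_true, np.array([0, 1, 1, 0])))  # 0.0: the predicted positives are all 1s
print(entropy_score(y_true, np.array([1, 1, 1, 1])))  # 1.0: falls back to the overall mean of 0.5
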
def get_gini_impurity_reduction_from_sklearn_stump(m: sklearn.tree._classes.DecisionTreeClassifier)

Calculate the gini impurity reduction achieved by the first split of model m.

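A minimal sketch on a perfectly separating feature, where the root gini of 0.5 drops to 0 in both children:

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from imodelsx.augtree.utils import get_gini_impurity_reduction_from_sklearn_stump

X = np.array([[0], [0], [1], [1]])
y = np.array([0, 0, 1, 1])
stump = DecisionTreeClassifier(max_depth=1).fit(X, y)
print(get_gini_impurity_reduction_from_sklearn_stump(stump))  # 0.5
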
def get_spacy_tokenizer(convert_lower=True, use_stemming=False)

Return an LLMTreeTokenizer built on spaCy's English tokenizer.
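
A usage sketch (requires spacy; nltk as well if use_stemming=True):

from imodelsx.augtree.utils import get_spacy_tokenizer

tokenize = get_spacy_tokenizer(convert_lower=True, use_stemming=False)
print(tokenize('The cats are running'))  # ['the', 'cats', 'are', 'running']
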
def gini_binary(y_mean: float) -> float

Higher is better (higher = purer): a y_mean of 0 or 1 gives 1; a y_mean of 0.5 gives 0.5.

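Illustrative values:

from imodelsx.augtree.utils import gini_binary

print(gini_binary(0.5))  # 0.5 (maximally impure)
print(gini_binary(0.0), gini_binary(1.0))  # 1.0 1.0 (pure)
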
def gini_score(y_true, y_pred)

Higher is better: the gini purity of y_true restricted to the predicted-positive samples.

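An illustrative sketch:

import numpy as np
from imodelsx.augtree.utils import gini_score

y_true = np.array([0, 1, 1, 0])
print(gini_score(y_true, np.array([0, 1, 1, 0])))  # 1.0: the predicted positives are all 1s
print(gini_score(y_true, np.array([1, 1, 1, 1])))  # 0.5: falls back to the overall mean of 0.5
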
def impurity_entropy(y)

Lower impurity (closer to 0) is better

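Illustrative values (assuming the corrected implementation above, which returns the entropy directly):

import numpy as np
from imodelsx.augtree.utils import impurity_entropy

print(impurity_entropy(np.array([0, 0, 1, 1])))  # 1.0 (maximally impure)
print(impurity_entropy(np.array([1, 1, 1, 1])))  # 0.0 (pure)
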
def impurity_gini(y)

Lower impurity (closer to 0) is better

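Illustrative values:

import numpy as np
from imodelsx.augtree.utils import impurity_gini

print(impurity_gini(np.array([0, 0, 1, 1])))  # 0.5 (maximally impure)
print(impurity_gini(np.array([1, 1, 1, 1])))  # 0.0 (pure)
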
def impurity_mse(y)

Lower impurity (closer to 0) is better

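An illustrative value; this is just the variance of y:

import numpy as np
from imodelsx.augtree.utils import impurity_mse

print(impurity_mse(np.array([0, 0, 1, 1])))  # 0.25
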
def mse_score(y_true, y_pred)

Higher is better (negated MSE; 0 is best).

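An illustrative sketch:

import numpy as np
from imodelsx.augtree.utils import mse_score

y_true = np.array([0.0, 1.0, 1.0, 0.0])
print(mse_score(y_true, y_true))  # -0.0 (perfect predictions)
print(mse_score(y_true, np.array([0.0, 1.0, 1.0, 1.0])))  # -0.25
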

Classes

class LLMTreeTokenizer (convert_lower, use_stemming)

A lightweight tokenizer wrapping spaCy's English tokenizer, with optional lowercasing and optional Porter stemming (via nltk).
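
A usage sketch (requires spacy and, for stemming, nltk):

from imodelsx.augtree.utils import LLMTreeTokenizer

tok = LLMTreeTokenizer(convert_lower=True, use_stemming=True)
print(tok('Running dogs!'))  # ['run', 'dog', '!']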