from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from imodels.tree.custom_greedy_tree import CustomDecisionTreeClassifier


def compute_tree_complexity(tree, complexity_measure='num_rules'):
    """Calculate number of non-leaf nodes
    """
    children_left = tree.children_left
    children_right = tree.children_right
    complexity = 0
    stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
    while len(stack) > 0:
        # `pop` ensures each node is only visited once
        node_id, depth = stack.pop()

        # If the left and right children of a node differ, it is a split
        # node
        is_split_node = children_left[node_id] != children_right[node_id]

        # If a split node, append left and right children and depth to `stack`
        # so we can loop through them
        if is_split_node:
            if complexity_measure == 'num_rules':
                complexity += 1
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            # leaf node: counted for any other measure (e.g. 'num_leaves')
            if complexity_measure != 'num_rules':
                complexity += 1
    return complexity


def _validate_feature_costs(feature_costs, n_features):
    if feature_costs is None:
        feature_costs = np.ones(n_features, dtype=np.float64)
    else:
        assert len(feature_costs) == n_features, \
            f'{len(feature_costs)} != {n_features}'
        assert np.min(feature_costs) >= 0, 'feature costs must be non-negative'
    return feature_costs


def calculate_mean_depth_of_points_in_tree(tree, X, feature_costs=None):
    """Calculate the mean depth of each point in the tree.
    This is the average depth of the path from the root to the point.
    """
    feature_costs = _validate_feature_costs(
        feature_costs, n_features=X.shape[1])

    if isinstance(tree, CustomDecisionTreeClassifier):
        return _mean_depth_custom_tree(tree, X, feature_costs)
    elif hasattr(tree, 'tree_'):
        return _mean_depth_sklearn_tree(tree.tree_, X, feature_costs)
    elif hasattr(tree, 'estimator_'):
        return _mean_depth_sklearn_tree(tree.estimator_.tree_, X, feature_costs)
    else:
        return _mean_depth_coct_tree(tree, X, feature_costs)


def _mean_depth_custom_tree(custom_tree_, X, feature_costs):
    """Cost-weighted mean leaf depth for a CustomDecisionTreeClassifier,
    weighting each leaf by its number of training samples."""
    node = custom_tree_.root
    n_samples = []
    cum_costs = []
    is_leaves = []
    stack = [(node, 0)]
    while len(stack) > 0:
        node, cost = stack.pop()
        n_samples.append(node.num_samples)
        cum_costs.append(cost)
        is_leaves.append(node.left is None and node.right is None)

        if node.left:
            stack.append((node.left, cost + feature_costs[node.feature_index]))
        if node.right:
            stack.append(
                (node.right, cost + feature_costs[node.feature_index]))
    is_leaves = np.array(is_leaves)
    cum_costs = np.array(cum_costs)[is_leaves]
    n_samples = np.array(n_samples)[is_leaves]
    costs = cum_costs * n_samples / np.sum(n_samples)
    return np.sum(costs)


def _mean_depth_sklearn_tree(tree_, X, feature_costs):
    """Cost-weighted mean leaf depth for a fitted sklearn Tree object,
    weighting each leaf by its number of training samples."""
    n_nodes = tree_.node_count
    children_left = tree_.children_left
    children_right = tree_.children_right

    # things to compute
    _node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    _is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    _cum_costs = np.zeros(shape=n_nodes, dtype=np.float64)

    # start with the root node id (0) and its depth (0) and its cost (0)
    stack = [(0, 0, 0)]
    while len(stack) > 0:
        node_id, depth, cost = stack.pop()
        _node_depth[node_id] = depth
        _cum_costs[node_id] = cost

        is_split_node = children_left[node_id] != children_right[node_id]

        if is_split_node:
            # only split nodes test a feature; leaves store feature == -2,
            # so the cost increment must happen inside this branch
            cost += feature_costs[tree_.feature[node_id]]
            stack.append((children_left[node_id], depth + 1, cost))
            stack.append((children_right[node_id], depth + 1, cost))
        else:
            _is_leaves[node_id] = True

    # iterate over leaves and calculate the number of samples in each of them
    n_samples = tree_.n_node_samples
    leaf_samples = n_samples[_is_leaves].astype(np.float64)
    depths = _cum_costs[_is_leaves] * leaf_samples / np.sum(leaf_samples)
    return np.sum(depths)


def _mean_depth_coct_tree(coct_tree, X, feature_costs):
    """Cost-weighted mean depth of the points in X for a tree exposing
    decision_path, branch_nodes and feature_ (e.g. a COCT tree)."""
    indicator = coct_tree.decision_path(X)[:, coct_tree.branch_nodes]
    feature_use_counts = indicator.sum(axis=0)
    n_branch_nodes = indicator.shape[1]
    node_idx_to_feature_cost = {
        i: feature_costs[coct_tree.feature_[i]] for i in range(n_branch_nodes)
    }
    cost = sum(
        [node_idx_to_feature_cost[i] * feature_use_counts[i]
         for i in range(n_branch_nodes)]
    ) / len(X)
    return cost


def calculate_mean_unique_calls_in_ensemble(ensemble, X, feature_costs=None):
    '''Calculate the mean number of unique calls in the ensemble, i.e. the
    average number of distinct features queried per sample across all trees.
    '''
    if X is None:
        # X should be passed; random binary data here is only for testing
        X = np.random.randint(2, size=(100, ensemble.n_features_in_))

    feature_costs = _validate_feature_costs(
        feature_costs, n_features=ensemble.n_features_in_)

    # extract the decision path for each sample
    # estimators_ may be a 2-D array (e.g. gradient boosting), so flatten it
    ests = ensemble.estimators_.flatten()
    feats = [set() for _ in range(len(X))]
    for est in ests:
        node_index = est.decision_path(X).toarray()
        feats_est = [
            {est.tree_.feature[x] for x in np.nonzero(row)[0]}
            for row in node_index
        ]
        for j in range(len(feats)):
            feats[j] |= feats_est[j]
    # subtract 1 to discount the leaf marker: leaf nodes store feature == -2,
    # which ends up in every sample's feature set
    return np.mean([len(f) - 1 for f in feats])


def compute_mean_llm_calls(model_name, num_prompts, model=None, X=None):
    """Estimate the mean number of LLM calls per sample for a fitted model."""
    if model_name == "manual_tree":
        return calculate_mean_depth_of_points_in_tree(model, X)
    elif model_name == "manual_hstree":
        return calculate_mean_depth_of_points_in_tree(model.estimator_, X)
    elif model_name == "manual_gbdt":
        return calculate_mean_unique_calls_in_ensemble(model, X)
    elif model_name == "manual_tree_cv":
        return calculate_mean_depth_of_points_in_tree(model.best_estimator_, X)
    elif model_name == "manual_single_prompt":
        return 1
    elif model_name in ["manual_ensemble", "manual_boosting"]:
        return num_prompts
    else:
        return num_prompts


if __name__ == '__main__':
    X, y = datasets.fetch_california_housing(return_X_y=True)  # regression
    m = DecisionTreeRegressor(random_state=42, max_leaf_nodes=4)
    m.fit(X, y)
    print(compute_tree_complexity(m.tree_, complexity_measure='num_leaves'))
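    # extra usage sketch: mean root-to-leaf path length over the training
    # points (uniform feature costs by default)
    print(calculate_mean_depth_of_points_in_tree(m, X))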

Functions

def calculate_mean_depth_of_points_in_tree(tree, X, feature_costs=None)

Calculate the mean cost-weighted depth of the points in the tree: the average total feature cost along the root-to-leaf path each point follows (the plain path length when all feature costs are 1).

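A minimal usage sketch (the dataset and cost vector below are illustrative stand-ins, not part of the module). With the default uniform costs the result is the mean root-to-leaf path length; a custom feature_costs vector reweights each split by the cost of the feature it tests:

    import numpy as np
    from sklearn.datasets import load_diabetes
    from sklearn.tree import DecisionTreeRegressor

    X, y = load_diabetes(return_X_y=True)
    tree = DecisionTreeRegressor(max_depth=3, random_state=0).fit(X, y)
    costs = np.ones(X.shape[1])
    costs[2] = 2.0  # pretend the third feature is twice as costly to query
    print(calculate_mean_depth_of_points_in_tree(tree, X, feature_costs=costs))
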
def calculate_mean_unique_calls_in_ensemble(ensemble, X, feature_costs=None)

Calculate the mean number of unique calls in the ensemble, i.e. the average number of distinct features queried per sample across all trees.

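A minimal sketch, assuming a fitted gradient-boosting model (its estimators_ attribute is a 2-D array of trees, which the flatten() call in the source expects); the dataset is a stand-in:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    gbdt = GradientBoostingClassifier(n_estimators=10, max_depth=2).fit(X, y)
    # average number of distinct features a sample's paths touch across trees
    print(calculate_mean_unique_calls_in_ensemble(gbdt, X))
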
def compute_mean_llm_calls(model_name, num_prompts, model=None, X=None)
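Estimate the mean number of LLM calls per sample for a fitted model; the "manual_*" strategy names select how the estimate is computed.

A hedged sketch of calling it, with a plain decision tree standing in for a prompt-based model:

    from sklearn.datasets import make_classification
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    m = DecisionTreeClassifier(max_depth=3).fit(X, y)
    print(compute_mean_llm_calls("manual_tree", num_prompts=5, model=m, X=X))
    print(compute_mean_llm_calls("manual_ensemble", num_prompts=5))  # -> 5
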
def compute_tree_complexity(tree, complexity_measure='num_rules')

Calculate the complexity of a fitted tree: the number of split (non-leaf) nodes if complexity_measure is 'num_rules', otherwise the number of leaf nodes.

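A short usage sketch (note the function takes the fitted estimator's underlying tree_ object, as in the module's __main__ block):

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    m = DecisionTreeClassifier(max_leaf_nodes=4, random_state=0).fit(X, y)
    print(compute_tree_complexity(m.tree_, 'num_rules'))   # split nodes
    print(compute_tree_complexity(m.tree_, 'num_leaves'))  # leaves (= splits + 1)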