Expand source code
import pandas as pd
import numpy as np
from json import dumps, JSONEncoder
from numpy import array
from sklearn.metrics import accuracy_score, balanced_accuracy_score


# JSONEncoder override that converts NumPy types into native Python values
class NumpyEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NumpyEncoder, self).default(obj)


class TreeClassifier:
    """
    Unified representation of a tree classifier in Python

    This class accepts a dictionary representation of a tree classifier and decodes it into an
    interactive object.

    An additional encoding/decoding layer can be supplied if the feature space of the model
    differs from the feature space of the original data.
    """

    def __init__(self, source, encoder=None, X=None, y=None):

        # The classifier stored in a recursive dictionary structure
        self.source = source

        # Optional encoder / decoder unit to run before / after prediction
        self.encoder = encoder

        # Original training features and labels to fill in missing training loss values
        if X is not None and y is not None:
            self.__initialize_training_loss__(X, y)

    def __initialize_training_loss__(self, X, y):
        """
        Compares every prediction y_hat against the labels y, then incorporates the misprediction
        into the stored loss values

        This is used when parsing models from an algorithm that doesn't provide the training loss
        in the output
        """
        for node in self.__all_leaves__():
            node["loss"] = 0.0
        (n, m) = X.shape
        for i in range(n):
            node = self.__find_leaf__(X.values[i, :])
            label = y.values[i, -1]
            weight = 1 / n
            if node["prediction"] != label:
                node["loss"] += weight
        return

    def __find_leaf__(self, sample):
        """
        Returns
        ---
        the leaf by which this sample would be classified
        """
        nodes = [self.source]
        while len(nodes) > 0:
            node = nodes.pop()
            if "prediction" in node:
                return node
            else:
                value = sample[node["feature"]]
                reference = node["reference"]
                if node["relation"] == "==":
                    if value == reference:
                        nodes.append(node["true"])
                    else:
                        nodes.append(node["false"])
                elif node["relation"] == ">=":
                    if value >= reference:
                        nodes.append(node["true"])
                    else:
                        nodes.append(node["false"])
                elif node["relation"] == "<=":
                    if value <= reference:
                        nodes.append(node["true"])
                    else:
                        nodes.append(node["false"])
                elif node["relation"] == ">":
                    if value > reference:
                        nodes.append(node["true"])
                    else:
                        nodes.append(node["false"])
                elif node["relation"] == "<":
                    if value < reference:
                        nodes.append(node["true"])
                    else:
                        nodes.append(node["false"])
                else:
                    raise "Unsupported relational operator {}".format(node["relation"])

    def __all_leaves__(self):
        """
        Returns
        ---
        list : a list of all leaves in this model
        """
        nodes = [self.source]
        leaf_list = []
        while len(nodes) > 0:
            node = nodes.pop()
            if "prediction" in node:
                leaf_list.append(node)
            else:
                nodes.append(node["true"])
                nodes.append(node["false"])
        return leaf_list

    def loss(self):
        """
        Returns
        ---
        real number : a value in [0, 1]
            the training loss of this model
        """
        return sum(node["loss"] for node in self.__all_leaves__())

    def classify(self, sample):
        """
        Parameters
        ---
        sample : array-like, shape = [m_features]
            a 1-by-m row representing each feature of a single sample

        Returns
        ---
        tuple : (prediction, confidence)
            the prediction for the given sample, paired with the conditional probability (given
            the observations along the decision path) of the prediction being correct
        """
        node = self.__find_leaf__(sample)
        return node["prediction"], 1 - node["loss"]

    def predict(self, X):
        """
        Requires
        ---
        the features in X should be in their original (pre-encoding) form when an encoder is
        used; the encoder is applied internally before prediction

        Parameters
        ---
        X : matrix-like, shape = [n_samples by m_features]
            a matrix where each row is a sample to be predicted and each column is a feature to be
            used for prediction

        Returns
        ---
        array-like, shape = [n_samples by 1] : a column where each element is the prediction
            associated with each row
        """
        # Perform an encoding if an encoding unit is specified
        if self.encoder is not None:
            X = pd.DataFrame(self.encoder.encode(X.values[:, :]), columns=self.encoder.headers)

        predictions = []
        (n, m) = X.shape
        for i in range(n):
            prediction, _ = self.classify(X.values[i, :])
            predictions.append(prediction)
        return array(predictions)

    def confidence(self, X):
        """
        Requires
        ---
        the features in X should be in their original (pre-encoding) form when an encoder is
        used; the encoder is applied internally before prediction

        Parameters
        ---
        X : matrix-like, shape = [n_samples by m_features]
            a matrix where each row is a sample to be predicted and each column is a feature to be
            used for prediction

        Returns
        ---
        array-like, shape = [n_samples by 1] : a column where each element is the conditional
            probability of each prediction (conditioned only on the features that were used in
            prediction)
        """
        if self.encoder is not None:
            X = pd.DataFrame(self.encoder.encode(X.values[:, :]), columns=self.encoder.headers)

        conditional_probabilities = []
        n = X.shape[0]
        for i in range(n):
            _, conditional_probability = self.classify(X.values[i, :])
            conditional_probabilities.append(conditional_probability)
        return array(conditional_probabilities)

    def error(self, X, y, weight=None):
        """
        Parameters
        ---
        X : matrix-like, shape = [n_samples by m_features]
            an n-by-m matrix of samples and their features
        y : array-like, shape = [n_samples by 1]
            an n-by-1 column of labels associated with each sample
        weight : array-like of shape [n_samples], "balanced", or None
            an n-by-1 column of weights to apply to each sample's misclassification; pass
            "balanced" to weight classes inversely to their frequency

        Returns
        ---
        real number : the inaccuracy produced by applying this model over the given dataset,
            with optional support for weighted inaccuracy
        """
        return 1 - self.score(X, y, weight=weight)

    def score(self, X, y, weight=None):
        """
        Parameters
        ---
        X : matrix-like, shape = [n_samples by m_features]
            an n-by-m matrix of samples and their features
        y : array-like, shape = [n_samples by 1]
            an n-by-1 column of labels associated with each sample
        weight : array-like of shape [n_samples], "balanced", or None
            an n-by-1 column of weights to apply to each sample's misclassification; pass
            "balanced" to weight classes inversely to their frequency

        Returns
        ---
        real number : the accuracy produced by applying this model over the given dataset,
            with optional support for weighted accuracy
        """
        y_hat = self.predict(X)
        if isinstance(weight, str) and weight == "balanced":
            return balanced_accuracy_score(y, y_hat)
        else:
            return accuracy_score(y, y_hat, normalize=True, sample_weight=weight)

    def __len__(self):
        """
        Returns
        ---
        natural number : The number of terminal nodes present in this tree
        """
        return self.leaves()

    def leaves(self):
        """
        Returns
        ---
        natural number : The number of terminal nodes present in this tree
        """
        leaves_counter = 0
        nodes = [self.source]
        while len(nodes) > 0:
            node = nodes.pop()
            if "prediction" in node:
                leaves_counter += 1
            else:
                nodes.append(node["true"])
                nodes.append(node["false"])
        return leaves_counter

    def nodes(self):
        """
        Returns
        ---
        natural number : The number of nodes present in this tree
        """
        nodes_counter = 0
        nodes = [self.source]
        while len(nodes) > 0:
            node = nodes.pop()
            if "prediction" in node:
                nodes_counter += 1
            else:
                nodes_counter += 1
                nodes.append(node["true"])
                nodes.append(node["false"])
        return nodes_counter

    def features(self):
        """
        Returns
        ---
        set : A set of strings each describing the features used by this model
        """
        feature_set = set()
        nodes = [self.source]
        while len(nodes) > 0:
            node = nodes.pop()
            if "prediction" in node:
                continue
            else:
                feature_set.add(node["name"])
                nodes.append(node["true"])
                nodes.append(node["false"])
        return feature_set

    def encoded_features(self):
        """
        Returns
        ---
        natural number : The number of encoded features used by the supplied encoder to represent
            the data set
        """
        return len(self.encoder.headers) if self.encoder is not None else None

    def maximum_depth(self, node=None):
        """
        Returns
        ---
        natural number : the length of the longest decision path in this tree. A single-node tree
            will return 1.
        """
        if node is None:
            node = self.source
        if "prediction" in node:
            return 1
        else:
            return 1 + max(self.maximum_depth(node["true"]), self.maximum_depth(node["false"]))

    def __str__(self):
        """
        Returns
        ---
        string : pseudocode representing the logic of this classifier
        """
        cases = []
        for group in self.__groups__():
            predicates = []
            for name in sorted(group["rules"].keys()):
                domain = group["rules"][name]
                if domain["type"] == "Categorical":
                    if len(domain["positive"]) > 0:
                        predicates.append("{} = {}".format(name, list(domain["positive"])[0]))
                    elif len(domain["negative"]) > 0:
                        if len(domain["negative"]) > 1:
                            predicates.append("{} not in {{ {} }}".format(
                                name, ", ".join([str(v) for v in domain["negative"]])))
                        else:
                            predicates.append("{} != {}".format(
                                name, str(list(domain["negative"])[0])))
                    else:
                        raise "Invalid Rule"
                elif domain["type"] == "Numerical":
                    predicate = name
                    if domain["min"] != -float("INF"):
                        predicate = "{} <= ".format(domain["min"]) + predicate
                    if domain["max"] != float("INF"):
                        predicate = predicate + " < {}".format(domain["max"])
                    predicates.append(predicate)

            if len(predicates) == 0:
                condition = "if true then:"
            else:
                condition = "if {} then:".format(" and ".join(predicates))
            outcomes = []
            # for outcome, probability in group["distribution"].items():
            outcomes.append("    predicted {}: {}".format(group["name"], group["prediction"]))
            outcomes.append("    misclassification penalty: {}".format(round(group["loss"], 3)))
            outcomes.append("    complexity penalty: {}".format(round(group["complexity"], 3)))
            result = "\n".join(outcomes)
            cases.append("{}\n{}".format(condition, result))
        return "\n\nelse ".join(cases)

    def __repr__(self):
        """
        Returns
        ---
        string : the string form of the recursive dictionary used to represent the model
        """
        return str(self.source)

    def latex(self, node=None):
        """
        Note
        ---
        This method doesn't work well for label headers that contain underscores due to underscore
        being a reserved character in LaTeX

        Returns
        ---
        string : A LaTeX string representing the model
        """
        if node is None:
            node = self.source
        if "prediction" in node:
            if "name" in node:
                name = node["name"]
            else:
                name = "feature_{}".format(node["feature"])
            return "[ ${}$ [ ${}$ ] ]".format(name, node["prediction"])
        else:
            if "name" in node:
                if "=" in node["name"]:
                    name = "{}".format(node["name"])
                else:
                    name = "{} {} {}".format(node["name"], node["relation"], node["reference"])
            else:
                name = "feature_{} {} {}".format(
                    node["feature"], node["relation"], node["reference"])
            return (
                "[ ${}$ {} {} ]"
                    .format(name, self.latex(node["true"]), self.latex(node["false"]))
                    .replace("==", r" \eq ").replace(">=", r" \ge ").replace("<=", r" \le ")
            )

    def json(self):
        """
        Returns
        ---
        string : A JSON string representing the model
        """
        return dumps(self.source, cls=NumpyEncoder)

    def __groups__(self, node=None):
        """
        Parameters
        ---
        node : node within the tree from which to start
        Returns
        ---
        list : Object representation of each leaf for conversion to a case in an if-then-else
            statement
        """
        if node is None:
            node = self.source
        if "prediction" in node:
            node["rules"] = {}
            groups = [node]
            return groups
        else:
            if "name" in node:
                name = node["name"]
            else:
                name = "feature_{}".format(node["feature"])
            reference = node["reference"]
            groups = []
            for condition_result in ["true", "false"]:
                subtree = node[condition_result]
                for group in self.__groups__(subtree):

                    # For each group, add the corresponding rule
                    rules = group["rules"]
                    if name not in rules:
                        rules[name] = {}
                    rule = rules[name]
                    if node["relation"] == "==":
                        rule["type"] = "Categorical"
                        if "positive" not in rule:
                            rule["positive"] = set()
                        if "negative" not in rule:
                            rule["negative"] = set()
                        if condition_result == "true":
                            rule["positive"].add(reference)
                        elif condition_result == "false":
                            rule["negative"].add(reference)
                        else:
                            raise "OptimalSparseDecisionTree: Malformatted source {}".format(node)
                    elif node["relation"] == ">=":
                        rule["type"] = "Numerical"
                        if "max" not in rule:
                            rule["max"] = float("INF")
                        if "min" not in rule:
                            rule["min"] = -float("INF")
                        if condition_result == "true":
                            rule["min"] = max(reference, rule["min"])
                        elif condition_result == "false":
                            rule["max"] = min(reference, rule["max"])
                        else:
                            raise "OptimalSparseDecisionTree: Malformatted source {}".format(node)
                    else:
                        raise "Unsupported relational operator {}".format(node["relation"])

                    # Add the modified group to the group list
                    groups.append(group)
            return groups

Classes

class NumpyEncoder (*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)

Extensible JSON https://json.org encoder for Python data structures.

Supports the following objects and types by default:

+-------------------+---------------+
| Python            | JSON          |
+===================+===============+
| dict              | object        |
+-------------------+---------------+
| list, tuple       | array         |
+-------------------+---------------+
| str               | string        |
+-------------------+---------------+
| int, float        | number        |
+-------------------+---------------+
| True              | true          |
+-------------------+---------------+
| False             | false         |
+-------------------+---------------+
| None              | null          |
+-------------------+---------------+

To extend this to recognize other objects, subclass and implement a .default() method that returns a serializable object for o if possible; otherwise it should call the superclass implementation (to raise a TypeError).

Constructor for JSONEncoder, with sensible defaults.

If skipkeys is false, then it is a TypeError to attempt encoding of keys that are not str, int, float or None. If skipkeys is True, such items are simply skipped.

If ensure_ascii is true, the output is guaranteed to be str objects with all incoming non-ASCII characters escaped. If ensure_ascii is false, the output can contain non-ASCII characters.

If check_circular is true, then lists, dicts, and custom encoded objects will be checked for circular references during encoding to prevent an infinite recursion (which would cause a RecursionError). Otherwise, no such check takes place.

If allow_nan is true, then NaN, Infinity, and -Infinity will be encoded as such. This behavior is not JSON specification compliant, but is consistent with most JavaScript based encoders and decoders. Otherwise, it will be a ValueError to encode such floats.

If sort_keys is true, then the output of dictionaries will be sorted by key; this is useful for regression tests to ensure that JSON serializations can be compared on a day-to-day basis.

If indent is a non-negative integer, then JSON array elements and object members will be pretty-printed with that indent level. An indent level of 0 will only insert newlines. None is the most compact representation.

If specified, separators should be an (item_separator, key_separator) tuple. The default is (', ', ': ') if indent is None and (',', ': ') otherwise. To get the most compact JSON representation, you should specify (',', ':') to eliminate whitespace.

If specified, default is a function that gets called for objects that can't otherwise be serialized. It should return a JSON encodable version of the object or raise a TypeError.
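
As a quick illustration, here is a minimal sketch of serializing NumPy values with this encoder; the payload below is hypothetical:

import numpy as np
from json import dumps

# NumPy scalars and arrays are not JSON-serializable by default;
# NumpyEncoder converts them to native Python values first.
payload = {
    "count": np.int64(3),             # encoded as int
    "loss": np.float64(0.125),        # encoded as float
    "weights": np.array([0.5, 0.5]),  # encoded as list
}
print(dumps(payload, cls=NumpyEncoder))
# {"count": 3, "loss": 0.125, "weights": [0.5, 0.5]}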


Ancestors

  • json.encoder.JSONEncoder

Methods

def default(self, obj)

Implement this method in a subclass such that it returns a serializable object for o, or calls the base implementation (to raise a TypeError).

For example, to support arbitrary iterators, you could implement default like this::

def default(self, o):
    try:
        iterable = iter(o)
    except TypeError:
        pass
    else:
        return list(iterable)
    # Let the base class default method raise the TypeError
    return JSONEncoder.default(self, o)
class TreeClassifier (source, encoder=None, X=None, y=None)

Unified representation of a tree classifier in Python

This class accepts a dictionary representation of a tree classifier and decodes it into an interactive object.

An additional encoding/decoding layer can be supplied if the feature space of the model differs from the feature space of the original data.
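
For orientation, here is a minimal usage sketch built from a hand-written source dictionary; the single-split tree, the feature name "age", and all values below are hypothetical, and real sources would normally come from a tree-producing algorithm:

# A root split on feature 0 ("age") with two terminal nodes.
source = {
    "feature": 0, "name": "age", "relation": ">=", "reference": 50,
    "true":  {"name": "risk", "prediction": "high", "loss": 0.10, "complexity": 0.01},
    "false": {"name": "risk", "prediction": "low",  "loss": 0.05, "complexity": 0.01},
}
model = TreeClassifier(source)
print(model.leaves())         # 2 terminal nodes
print(model.nodes())          # 3 nodes in total
print(model.maximum_depth())  # 2, the longest decision path
print(model.features())       # {'age'}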


Methods

def classify(self, sample)

Parameters

sample : array-like, shape = [m_features]
a 1-by-m row representing each feature of a single sample

Returns

tuple : (prediction, confidence) where prediction is the predicted label and confidence is the
conditional probability (given the observations along the decision path) of the prediction being correct
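
Continuing the hypothetical source dictionary sketched above, classify takes a single encoded row indexed by feature position:

prediction, confidence = model.classify([55])  # 55 >= 50 takes the "true" branch
print(prediction, confidence)                  # high 0.9 (confidence = 1 - leaf loss)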
def confidence(self, X)

Requires

the features in X should be in their original (pre-encoding) form when an encoder is used; the encoder is applied internally before prediction

Parameters

X : matrix-like, shape = [n_samples by m_features]
a matrix where each row is a sample to be predicted and each column is a feature to be used for prediction

Returns

array-like, shape = [n_samples by 1] : a column where each element is the conditional
probability of each prediction (conditioned only on the features that were used in prediction)
def encoded_features(self)

Returns

natural number : The number of encoded features used by the supplied encoder to represent
the data set
def error(self, X, y, weight=None)

Parameters

X : matrix-like, shape = [n_samples by m_features]
an n-by-m matrix of samples and their features
y : array-like, shape = [n_samples by 1]
an n-by-1 column of labels associated with each sample
weight : array-like of shape [n_samples], "balanced", or None
an n-by-1 column of weights to apply to each sample's misclassification; pass "balanced" to weight classes inversely to their frequency

Returns

real number : the inaccuracy produced by applying this model over the given dataset, with
optional support for weighted inaccuracy
def features(self)

Returns

set : A set of strings each describing the features used by this model
 
def json(self)

Returns

string : A JSON string representing the model
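
A short sketch of a serialization round trip, continuing the hypothetical model above:

from json import loads

serialized = model.json()                    # JSON text, NumPy-safe via NumpyEncoder
rebuilt = TreeClassifier(loads(serialized))  # decode back into an interactive object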
 
def latex(self, node=None)

Note

This method doesn't work well for label headers that contain underscores due to underscore being a reserved character in LaTeX

Returns

string : A LaTeX string representing the model
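
The emitted bracket notation follows the style of the LaTeX forest package; a hedged sketch of embedding it, assuming forest is the intended renderer:

# Wrap the bracket string in a forest environment for rendering.
with open("tree.tex", "w") as out:
    out.write("\\begin{forest}\n%s\n\\end{forest}\n" % model.latex())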
 
def leaves(self)

Returns

natural number : The number of terminal nodes present in this tree
 
def loss(self)

Returns

real number : a value in [0, 1]
the training loss of this model
def maximum_depth(self, node=None)

Returns

natural number : the length of the longest decision path in this tree. A single-node tree
will return 1.
def nodes(self)

Returns

natural number : The number of nodes present in this tree
 
def predict(self, X)

Requires

the features in X should be in their original (pre-encoding) form when an encoder is used; the encoder is applied internally before prediction

Parameters

X : matrix-like, shape = [n_samples by m_features]
a matrix where each row is a sample to be predicted and each column is a feature to be used for prediction

Returns

array-like, shape = [n_samples by 1] : a column where each element is the prediction
associated with each row
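
A minimal sketch over a two-row DataFrame whose column layout matches the hypothetical model above; no encoder is attached here, so the features pass through unchanged:

import pandas as pd

X = pd.DataFrame([[55], [42]], columns=["age"])
print(model.predict(X))      # ['high' 'low']
print(model.confidence(X))   # [0.9  0.95], one conditional probability per row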
def score(self, X, y, weight=None)

Parameters

X : matrix-like, shape = [n_samples by m_features]
an n-by-m matrix of samples and their features
y : array-like, shape = [n_samples by 1]
an n-by-1 column of labels associated with each sample
weight : array-like of shape [n_samples], "balanced", or None
an n-by-1 column of weights to apply to each sample's misclassification; pass "balanced" to weight classes inversely to their frequency

Returns

real number : the accuracy produced by applying this model over the given dataset, with
optional support for weighted accuracy
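
A short sketch of scoring the hypothetical model against made-up labels, continuing the DataFrame X from the predict example:

y = ["high", "low"]                          # ground-truth labels for the two rows
print(model.score(X, y))                     # 1.0, the fraction predicted correctly
print(model.error(X, y))                     # 0.0, equal to 1 - score
print(model.score(X, y, weight="balanced"))  # 1.0, class-balanced accuracy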