Expand source code
import math


def prettify(elem, level=0):
    i = "\n" + level * "  "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + "  "
        for e in elem:
            prettify(e, level + 1)
        if not e.tail or not e.tail.strip():
            e.tail = i
    if level and (not elem.tail or not elem.tail.strip()):
        elem.tail = i
    return elem


def is_numeric_feature(attr):
    for x in set(attr):
        if not x == "?":
            try:
                x = float(x)
                return isinstance(x, float)
            except ValueError:
                return False
    return True


def set_as_leaf_node(parent, y_str):
    num_max = 0
    for cat in set(y_str):
        num_cat = y_str.count(cat)
        if num_cat > num_max:
            num_max = num_cat
            most_cat = cat
    parent.text = most_cat


def entropy(x):
    ent = 0
    for k in set(x):
        p_i = float(x.count(k)) / len(x)
        ent = ent - p_i * math.log(p_i, 2)
    return ent


def gain_ratio(category, attr):
    s = 0
    cat = []
    att = []
    for i in range(len(attr)):
        if not attr[i] == "?":
            cat.append(category[i])
            att.append(attr[i])
    for i in set(att):
        p_i = float(att.count(i)) / len(att)
        cat_i = []
        for j in range(len(cat)):
            if att[j] == i:
                cat_i.append(cat[j])
        s = s + p_i * entropy(cat_i)
    gain = entropy(cat) - s
    ent_att = entropy(att)
    if ent_att == 0:
        return 0
    else:
        return gain / ent_att


def gain(category, attr):
    cats = []
    for i in range(len(attr)):
        if not attr[i] == "?":
            cats.append([float(attr[i]), category[i]])
    cats = sorted(cats, key=lambda x: x[0])

    cat = [cats[i][1] for i in range(len(cats))]
    att = [cats[i][0] for i in range(len(cats))]
    if len(set(att)) == 1:
        return 0
    else:
        gains = []
        div_point = []
        for i in range(1, len(cat)):
            if not att[i] == att[i - 1]:
                gains.append(entropy(cat[:i]) * float(i) / len(cat) + entropy(cat[i:]) * (1 - float(i) / len(cat)))
                div_point.append(i)
        gain = entropy(cat) - min(gains)

        p_1 = float(div_point[gains.index(min(gains))]) / len(cat)
        ent_attr = -p_1 * math.log(p_1, 2) - (1 - p_1) * math.log((1 - p_1), 2)
        return gain / ent_attr


def get_best_split(category, attr):
    cats = []
    for i in range(len(attr)):
        if not attr[i] == "?":
            cats.append([float(attr[i]), category[i]])
    cats = sorted(cats, key=lambda x: x[0])

    cat = [cats[i][1] for i in range(len(cats))]
    att = [cats[i][0] for i in range(len(cats))]
    gains = []
    split_point = []
    for i in range(1, len(cat)):
        if not att[i] == att[i - 1]:
            gains.append(entropy(cat[:i]) * float(i) / len(cat) + entropy(cat[i:]) * (1 - float(i) / len(cat)))
            split_point.append(i)
    return att[split_point[gains.index(min(gains))]]


def add(d1, d2):
    d = d1
    for i in d2:
        if d.has_key(i):
            d[i] = d[i] + d2[i]
        else:
            d[i] = d2[i]
    return d


def decision(root, obs, feature_names: list, p):
    if root.hasChildNodes():
        att_name = root.firstChild.nodeName
        if att_name == "#text":
            return decision(root.firstChild, obs, feature_names, p)
        else:
            att = obs[feature_names.index(att_name)]
            if att == "?":
                d = {}
                for child in root.childNodes:
                    d = add(d, decision(child, obs, feature_names, p * float(child.getAttribute("p"))))
                return d
            else:
                for child in root.childNodes:
                    if child.getAttribute("flag") == "m" and child.getAttribute("feature") == att or \
                            child.getAttribute("flag") == "l" and float(att) < float(child.getAttribute("feature")) or \
                            child.getAttribute("flag") == "r" and float(att) >= float(child.getAttribute("feature")):
                        return decision(child, obs, feature_names, p)
    else:
        return {root.nodeValue: root.parentNode.getAttribute('p')}

Functions

def add(d1, d2)
Expand source code
def add(d1, d2):
    d = d1
    for i in d2:
        if d.has_key(i):
            d[i] = d[i] + d2[i]
        else:
            d[i] = d2[i]
    return d
def decision(root, obs, feature_names: list, p)
Expand source code
def decision(root, obs, feature_names: list, p):
    if root.hasChildNodes():
        att_name = root.firstChild.nodeName
        if att_name == "#text":
            return decision(root.firstChild, obs, feature_names, p)
        else:
            att = obs[feature_names.index(att_name)]
            if att == "?":
                d = {}
                for child in root.childNodes:
                    d = add(d, decision(child, obs, feature_names, p * float(child.getAttribute("p"))))
                return d
            else:
                for child in root.childNodes:
                    if child.getAttribute("flag") == "m" and child.getAttribute("feature") == att or \
                            child.getAttribute("flag") == "l" and float(att) < float(child.getAttribute("feature")) or \
                            child.getAttribute("flag") == "r" and float(att) >= float(child.getAttribute("feature")):
                        return decision(child, obs, feature_names, p)
    else:
        return {root.nodeValue: root.parentNode.getAttribute('p')}
def entropy(x)
Expand source code
def entropy(x):
    ent = 0
    for k in set(x):
        p_i = float(x.count(k)) / len(x)
        ent = ent - p_i * math.log(p_i, 2)
    return ent
def gain(category, attr)
Expand source code
def gain(category, attr):
    cats = []
    for i in range(len(attr)):
        if not attr[i] == "?":
            cats.append([float(attr[i]), category[i]])
    cats = sorted(cats, key=lambda x: x[0])

    cat = [cats[i][1] for i in range(len(cats))]
    att = [cats[i][0] for i in range(len(cats))]
    if len(set(att)) == 1:
        return 0
    else:
        gains = []
        div_point = []
        for i in range(1, len(cat)):
            if not att[i] == att[i - 1]:
                gains.append(entropy(cat[:i]) * float(i) / len(cat) + entropy(cat[i:]) * (1 - float(i) / len(cat)))
                div_point.append(i)
        gain = entropy(cat) - min(gains)

        p_1 = float(div_point[gains.index(min(gains))]) / len(cat)
        ent_attr = -p_1 * math.log(p_1, 2) - (1 - p_1) * math.log((1 - p_1), 2)
        return gain / ent_attr
def gain_ratio(category, attr)
Expand source code
def gain_ratio(category, attr):
    s = 0
    cat = []
    att = []
    for i in range(len(attr)):
        if not attr[i] == "?":
            cat.append(category[i])
            att.append(attr[i])
    for i in set(att):
        p_i = float(att.count(i)) / len(att)
        cat_i = []
        for j in range(len(cat)):
            if att[j] == i:
                cat_i.append(cat[j])
        s = s + p_i * entropy(cat_i)
    gain = entropy(cat) - s
    ent_att = entropy(att)
    if ent_att == 0:
        return 0
    else:
        return gain / ent_att
def get_best_split(category, attr)
Expand source code
def get_best_split(category, attr):
    cats = []
    for i in range(len(attr)):
        if not attr[i] == "?":
            cats.append([float(attr[i]), category[i]])
    cats = sorted(cats, key=lambda x: x[0])

    cat = [cats[i][1] for i in range(len(cats))]
    att = [cats[i][0] for i in range(len(cats))]
    gains = []
    split_point = []
    for i in range(1, len(cat)):
        if not att[i] == att[i - 1]:
            gains.append(entropy(cat[:i]) * float(i) / len(cat) + entropy(cat[i:]) * (1 - float(i) / len(cat)))
            split_point.append(i)
    return att[split_point[gains.index(min(gains))]]
def is_numeric_feature(attr)
Expand source code
def is_numeric_feature(attr):
    for x in set(attr):
        if not x == "?":
            try:
                x = float(x)
                return isinstance(x, float)
            except ValueError:
                return False
    return True
def prettify(elem, level=0)
Expand source code
def prettify(elem, level=0):
    i = "\n" + level * "  "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + "  "
        for e in elem:
            prettify(e, level + 1)
        if not e.tail or not e.tail.strip():
            e.tail = i
    if level and (not elem.tail or not elem.tail.strip()):
        elem.tail = i
    return elem
def set_as_leaf_node(parent, y_str)
Expand source code
def set_as_leaf_node(parent, y_str):
    num_max = 0
    for cat in set(y_str):
        num_cat = y_str.count(cat)
        if num_cat > num_max:
            num_max = num_cat
            most_cat = cat
    parent.text = most_cat