Expand source code
``````from __future__ import division

__author__ = 'Victor Ruiz, vmr11@pitt.edu'

from math import log

import pandas as pd

def entropy(data_classes, base=2):
    '''
    Computes the entropy of a set of labels (class instantiations)

    Missing labels (NaN / None) are ignored: they are neither counted as a
    class nor included in the total used to compute class proportions.
    :param base: logarithm base for computation
    :param data_classes: Series with labels of examples in a dataset
    :return: value of entropy (0 when there are no labels to count)
    :raises AttributeError: if data_classes is not a pandas Series
    '''
    if not isinstance(data_classes, pd.Series):
        raise AttributeError('input array should be a pandas series')

    # Count every class in a single pass instead of re-scanning the whole
    # series once per class. value_counts() leaves out missing labels, which
    # would otherwise produce a zero proportion and a log(0) domain error.
    counts = data_classes.value_counts()
    total = int(counts.sum())

    ent = 0  # initialize entropy
    # Categorical series report unobserved categories with a count of 0;
    # those contribute nothing and must not reach log().
    for count in counts[counts > 0]:
        proportion = float(count) / total
        # update entropy
        ent -= proportion * log(proportion, base)

    return ent

def cut_point_information_gain(dataset, cut_point, feature_label, class_label):
    '''
    Return the information gain obtained by splitting a numeric attribute in two according to cut_point

    Rows with feature value <= cut_point go to the left partition and rows
    with feature value > cut_point go to the right partition. Rows whose
    feature value is missing satisfy neither comparison, so they land in
    neither partition but still count towards the dataset size.
    :param dataset: pandas dataframe with a column for attribute values and a column for class
    :param cut_point: threshold at which to partition the numeric attribute
    :param feature_label: column label of the numeric attribute values in data
    :param class_label: column label of the array of instance classes
    :return: information gain of partition obtained by threshold cut_point
             (0.0 for an empty dataset)
    :raises AttributeError: if dataset is not a pandas DataFrame
    '''
    if not isinstance(dataset, pd.DataFrame):
        raise AttributeError('input dataset should be a pandas data frame')

    entropy_full = entropy(dataset[class_label])  # compute entropy of full dataset (w/o split)

    # split data at cut_point
    data_left = dataset[dataset[feature_label] <= cut_point]
    data_right = dataset[dataset[feature_label] > cut_point]
    (N, N_left, N_right) = (len(dataset), len(data_left), len(data_right))

    if N == 0:
        # Nothing to split: no information can be gained, and the weights
        # below would otherwise divide by zero.
        return 0.0

    # Gain = entropy before the split minus the size-weighted entropy of
    # each side of the split.
    weighted_left = (float(N_left) / N) * entropy(data_left[class_label])
    weighted_right = (float(N_right) / N) * entropy(data_right[class_label])

    return entropy_full - weighted_left - weighted_right

## Functions

``` def cut_point_information_gain(dataset, cut_point, feature_label, class_label) ```

Return the information gain obtained by splitting a numeric attribute in two according to cut_point. :param dataset: pandas dataframe with a column for attribute values and a column for class :param cut_point: threshold at which to partition the numeric attribute :param feature_label: column label of the numeric attribute values in data :param class_label: column label of the array of instance classes :return: information gain of partition obtained by threshold cut_point

Expand source code
def cut_point_information_gain(dataset, cut_point, feature_label, class_label):
    '''
    Return the information gain obtained by splitting a numeric attribute in two according to cut_point

    Rows with feature value <= cut_point go to the left partition and rows
    with feature value > cut_point go to the right partition. Rows whose
    feature value is missing satisfy neither comparison, so they land in
    neither partition but still count towards the dataset size.
    :param dataset: pandas dataframe with a column for attribute values and a column for class
    :param cut_point: threshold at which to partition the numeric attribute
    :param feature_label: column label of the numeric attribute values in data
    :param class_label: column label of the array of instance classes
    :return: information gain of partition obtained by threshold cut_point
             (0.0 for an empty dataset)
    :raises AttributeError: if dataset is not a pandas DataFrame
    '''
    if not isinstance(dataset, pd.DataFrame):
        raise AttributeError('input dataset should be a pandas data frame')

    entropy_full = entropy(dataset[class_label])  # compute entropy of full dataset (w/o split)

    # split data at cut_point
    data_left = dataset[dataset[feature_label] <= cut_point]
    data_right = dataset[dataset[feature_label] > cut_point]
    (N, N_left, N_right) = (len(dataset), len(data_left), len(data_right))

    if N == 0:
        # Nothing to split: no information can be gained, and the weights
        # below would otherwise divide by zero.
        return 0.0

    # Gain = entropy before the split minus the size-weighted entropy of
    # each side of the split.
    weighted_left = (float(N_left) / N) * entropy(data_left[class_label])
    weighted_right = (float(N_right) / N) * entropy(data_right[class_label])

    return entropy_full - weighted_left - weighted_right
``` def entropy(data_classes, base=2) ```

Computes the entropy of a set of labels (class instantiations) :param base: logarithm base for computation :param data_classes: Series with labels of examples in a dataset :return: value of entropy

Expand source code
def entropy(data_classes, base=2):
    '''
    Computes the entropy of a set of labels (class instantiations)

    Missing labels (NaN / None) are ignored: they are neither counted as a
    class nor included in the total used to compute class proportions.
    :param base: logarithm base for computation
    :param data_classes: Series with labels of examples in a dataset
    :return: value of entropy (0 when there are no labels to count)
    :raises AttributeError: if data_classes is not a pandas Series
    '''
    if not isinstance(data_classes, pd.Series):
        raise AttributeError('input array should be a pandas series')

    # Count every class in a single pass instead of re-scanning the whole
    # series once per class. value_counts() leaves out missing labels, which
    # would otherwise produce a zero proportion and a log(0) domain error.
    counts = data_classes.value_counts()
    total = int(counts.sum())

    ent = 0  # initialize entropy
    # Categorical series report unobserved categories with a count of 0;
    # those contribute nothing and must not reach log().
    for count in counts[counts > 0]:
        proportion = float(count) / total
        # update entropy
        ent -= proportion * log(proportion, base)

    return ent