"""Modified from https://github.com/RaczeQ/scikit-learn-C4.5-tree-classifier
References
----------
.. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
.. [2] https://en.wikipedia.org/wiki/C4.5_algorithm
"""
import copy
from copy import deepcopy
from typing import List
from xml.dom import minidom
from xml.etree import ElementTree as ET
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import cross_val_score
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
from imodels.util.arguments import check_fit_arguments
from ..c45_tree.c45_utils import decision, is_numeric_feature, gain, gain_ratio, get_best_split, \
set_as_leaf_node
from ...util.data_util import get_clean_dataset
def _add_label(node, label):
    """Append `label` to the node's running list of training labels,
    creating the list on the first visit."""
    if hasattr(node, "labels"):
        node.labels.append(label)
        return
    node.labels = [label]


def _get_next_node(children, att):
    """Return the child node matching attribute value `att`.

    Flags (set by C45TreeClassifier.grow_tree): "m" = categorical match,
    "l" = numeric less-than branch, "r" = numeric greater-or-equal branch.
    Implicitly returns None if no child matches.
    """
    for child in children:
        is_equal = child.getAttribute("flag") == "m" and child.getAttribute("feature") == att
        is_less_than = child.getAttribute("flag") == "l" and float(att) < float(child.getAttribute("feature"))
        is_greater_than = child.getAttribute("flag") == "r" and float(att) >= float(child.getAttribute("feature"))
        if is_equal or is_less_than or is_greater_than:
            return child
def shrink_node(node, reg_param, parent_val, parent_num, cum_sum, scheme, constant):
    """Recursively apply hierarchical shrinkage to the subtree rooted at `node`.

    Each node's value is rewritten as the cumulative sum of parent-child
    differences, each damped by a factor of 1 + reg_param / parent_num
    (or 1 + reg_param under the "constant" scheme).
    """
    is_leaf = not node.hasChildNodes()
    val = node.nodeValue
    is_root = parent_val is None and parent_num is None
    n_samples = len(node.labels) if (scheme != "leaf_based" or is_root) else parent_num
    if is_root:
        val_new = val
    else:
        reg_term = reg_param if scheme == "constant" else reg_param / parent_num
        val_new = (val - parent_val) / (1 + reg_term)
cum_sum += val_new
if is_leaf:
if scheme == "leaf_based":
v = constant + (val - constant) / (1 + reg_param / node.n_obs)
node.nodeValue = v
else:
node.nodeValue = cum_sum
else:
for c in node.childNodes:
shrink_node(c, reg_param, val, parent_num=n_samples, cum_sum=cum_sum, scheme=scheme, constant=constant)
return node
class C45TreeClassifier(BaseEstimator, ClassifierMixin):
"""A C4.5 tree classifier.
Parameters
----------
max_rules : int, optional (default=None)
Maximum number of split nodes allowed in the tree
"""
def __init__(self, max_rules: int = None):
super().__init__()
self.max_rules = max_rules
    def fit(self, X, y, feature_names: List[str] = None):
self.complexity_ = 0
# X, y = check_X_y(X, y)
X, y, feature_names = check_fit_arguments(self, X, y, feature_names)
self.resultType = type(y[0])
if feature_names is None:
self.feature_names = [f'X_{x}' for x in range(X.shape[1])]
else:
            # keep only alphanumeric chars and underscores; spaces become underscores
            self.feature_names = [''.join([c for c in x.replace(' ', '_') if c.isalnum() or c == '_'])
                                  for x in feature_names]
self.feature_names = [
'X_' + x if x[0].isdigit()
else x
for x in self.feature_names
]
assert len(self.feature_names) == X.shape[1]
data = [[] for i in range(len(self.feature_names))]
categories = []
for i in range(len(X)):
categories.append(str(y[i]))
for j in range(len(self.feature_names)):
data[j].append(X[i][j])
root = ET.Element('GreedyTree')
self.grow_tree(data, categories, root, self.feature_names) # adds to root
self.tree_ = ET.tostring(root, encoding="unicode")
# print('self.tree_', self.tree_)
self.dom_ = minidom.parseString(self.tree_)
return self
    def impute_nodes(self, X, y):
        """Route each training sample down the tree, recording its label at
        every node it visits, then set each node's value to the mean of the
        labels collected there (via _calc_probs).
        """
source_node = self.root
for i in range(len(y)):
sample, label = X[i, ...], y[i]
_add_label(source_node, label)
nodes = [source_node]
while len(nodes) > 0:
node = nodes.pop()
if not node.hasChildNodes():
continue
else:
att_name = node.firstChild.nodeName
if att_name != "#text":
att = sample[self.feature_names.index(att_name)]
next_node = _get_next_node(node.childNodes, att)
else:
next_node = node.firstChild
_add_label(next_node, label)
nodes.append(next_node)
self._calc_probs(source_node)
# self.dom_.childNodes[0] = source_node
# self.tree_.source = source_node
def _calc_probs(self, node):
node.nodeValue = np.mean(node.labels)
if not node.hasChildNodes():
return
for c in node.childNodes:
self._calc_probs(c)
def raw_preds(self, X):
check_is_fitted(self, ['tree_', 'resultType', 'feature_names'])
X = check_array(X)
if isinstance(X, pd.DataFrame):
X = deepcopy(X)
X.columns = self.feature_names
root = self.root
prediction = []
for i in range(X.shape[0]):
answerlist = decision(root, X[i], self.feature_names, 1)
answerlist = sorted(answerlist.items(), key=lambda x: x[1], reverse=True)
answer = answerlist[0][0]
# prediction.append(self.resultType(answer))
prediction.append(float(answer))
return np.array(prediction)
    def predict(self, X):
        raw_preds = self.raw_preds(X)
        return (raw_preds > 0.5).astype(int)
def predict_proba(self, X):
raw_preds = self.raw_preds(X)
return np.vstack((1 - raw_preds, raw_preds)).transpose()
def __str__(self):
check_is_fitted(self, ['tree_'])
return self.dom_.toprettyxml(newl="\r\n")
def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names):
"""
Parameters
----------
X_t: List[list]
input data transposed (num_features x num_observations)
y_str: List[str]
outcome represented as strings
        parent
            XML element to which child nodes are attached
        attrs_names
            names of the features (one per row of X_t)
"""
# check that y contains more than 1 distinct value
if len(set(y_str)) > 1:
split = []
# loop over features and build up potential splits
for i in range(len(X_t)):
if set(X_t[i]) == set("?"):
split.append(0)
else:
if is_numeric_feature(X_t[i]):
split.append(gain(y_str, X_t[i]))
else:
split.append(gain_ratio(y_str, X_t[i]))
            # no good split: make this node a leaf
if max(split) == 0:
set_as_leaf_node(parent, y_str)
# there is a good split
else:
index_selected = split.index(max(split))
name_selected = str(attrs_names[index_selected])
self.complexity_ += 1
if is_numeric_feature(X_t[index_selected]):
# split on this point
split_point = get_best_split(y_str, X_t[index_selected])
# build up children nodes
r_child_X = [[] for i in range(len(X_t))]
r_child_y = []
l_child_X = [[] for i in range(len(X_t))]
l_child_y = []
for i in range(len(y_str)):
if not X_t[index_selected][i] == "?":
if float(X_t[index_selected][i]) < float(split_point):
l_child_y.append(y_str[i])
for j in range(len(X_t)):
l_child_X[j].append(X_t[j][i])
else:
r_child_y.append(y_str[i])
for j in range(len(X_t)):
r_child_X[j].append(X_t[j][i])
# grow child nodes as well
if len(l_child_y) > 0 and len(r_child_y) > 0 and (
self.max_rules is None or
self.complexity_ <= self.max_rules
):
p_l = float(len(l_child_y)) / (len(X_t[index_selected]) - X_t[index_selected].count("?"))
son = ET.SubElement(parent, name_selected,
{'feature': str(split_point), "flag": "l", "p": str(round(p_l, 3))})
self.grow_tree(l_child_X, l_child_y, son, attrs_names)
son = ET.SubElement(parent, name_selected,
{'feature': str(split_point), "flag": "r", "p": str(round(1 - p_l, 3))})
self.grow_tree(r_child_X, r_child_y, son, attrs_names)
else:
num_max = 0
for cat in set(y_str):
num_cat = y_str.count(cat)
if num_cat > num_max:
num_max = num_cat
most_cat = cat
parent.text = most_cat
else:
                    # split on non-numeric (e.g. categorical) feature:
                    # create a child subtree for each unique value
for k in set(X_t[index_selected]):
if not k == "?" and (
self.max_rules is None or
self.complexity_ <= self.max_rules
):
child_X = [[] for i in range(len(X_t))]
child_y = []
for i in range(len(y_str)):
if X_t[index_selected][i] == k:
child_y.append(y_str[i])
for j in range(len(X_t)):
child_X[j].append(X_t[j][i])
son = ET.SubElement(parent, name_selected, {
'feature': k, "flag": "m",
'p': str(round(
float(len(child_y)) / (
len(X_t[index_selected]) - X_t[index_selected].count("?")),
3))})
self.grow_tree(child_X, child_y, son, attrs_names)
else:
parent.text = y_str[0]
@property
def root(self):
return self.dom_.childNodes[0]
class HSC45TreeClassifier(BaseEstimator):
def __init__(self, estimator_: C45TreeClassifier, reg_param: float = 1, shrinkage_scheme_: str = 'node_based'):
"""
Params
------
reg_param: float
Higher is more regularization (can be arbitrarily large, should not be < 0)
shrinkage_scheme: str
Experimental: Used to experiment with different forms of shrinkage. options are:
(i) node_based shrinks based on number of samples in parent node
(ii) leaf_based only shrinks leaf nodes based on number of leaf samples
(iii) constant shrinks every node by a constant lambda
"""
super().__init__()
self.reg_param = reg_param
# print('est', estimator_)
self.estimator_ = estimator_
self.shrinkage_scheme_ = shrinkage_scheme_
def _calc_probs(self, node):
self.estimator_._calc_probs(node)
def impute_nodes(self, X, y):
self.estimator_.impute_nodes(X, y)
def shrink_tree(self):
shrink_node(self.estimator_.root, self.reg_param, None, None, 0, self.shrinkage_scheme_, 0)
def predict_proba(self, X):
return self.estimator_.predict_proba(X)
    def fit(self, *args, **kwargs):
        X = kwargs['X'] if "X" in kwargs else args[0]
        y = kwargs['y'] if "y" in kwargs else args[1]
        if not hasattr(self.estimator_, "dom_"):
            self.estimator_.fit(X, y)
        self.impute_nodes(X, y)
        self.shrink_tree()
        return self
def predict(self, X):
return self.estimator_.predict(X)
@property
def complexity_(self):
return self.estimator_.complexity_
class HSC45TreeClassifierCV(HSC45TreeClassifier):
def __init__(self, estimator_: C45TreeClassifier,
reg_param_list: List[float] = [0.1, 1, 10, 50, 100, 500], shrinkage_scheme_: str = 'node_based',
cv: int = 3, scoring='accuracy', *args, **kwargs):
"""Note: args, kwargs are not used but left so that imodels-experiments can still pass redundant args
"""
super().__init__(estimator_, reg_param=None)
self.reg_param_list = np.array(reg_param_list)
self.cv = cv
self.scoring = scoring
self.shrinkage_scheme_ = shrinkage_scheme_
# print('estimator', self.estimator_,
# 'checks.check_is_fitted(estimator)', checks.check_is_fitted(self.estimator_))
# if checks.check_is_fitted(self.estimator_):
# raise Warning('Passed an already fitted estimator,'
# 'but shrinking not applied until fit method is called.')
def fit(self, X, y, *args, **kwargs):
self.scores_ = []
for reg_param in self.reg_param_list:
est = HSC45TreeClassifier(copy.deepcopy(self.estimator_), reg_param)
cv_scores = cross_val_score(est, X, y, cv=self.cv, scoring=self.scoring)
self.scores_.append(np.mean(cv_scores))
self.reg_param = self.reg_param_list[np.argmax(self.scores_)]
        return super().fit(X=X, y=y)
if __name__ == '__main__':
X, y, feature_names = get_clean_dataset('ionosphere', data_source='pmlb')
m = C45TreeClassifier(max_rules=3)
m.fit(X, y)
s_m = HSC45TreeClassifier(estimator_=m)
s_m.fit(X, y)
preds = s_m.predict_proba(X)
Functions
def shrink_node(node, reg_param, parent_val, parent_num, cum_sum, scheme, constant)
-
Recursively apply hierarchical shrinkage to the subtree rooted at node: each
node's value becomes the cumulative sum of parent-child differences, each
damped by 1 + reg_param / parent_num (or 1 + reg_param under the "constant" scheme).
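For intuition, under the node_based scheme this recursion telescopes into the hierarchical-shrinkage sum (a sketch of the math implied by the code above; here \(\lambda\) is reg_param, \(t_0, \ldots, t_K\) is the root-to-leaf path followed by a sample \(x\), and \(N(t)\) is the number of training samples reaching node \(t\)):

\[
\hat{f}_\lambda(x) = \hat{\mathbb{E}}_{t_0}[y] + \sum_{k=1}^{K} \frac{\hat{\mathbb{E}}_{t_k}[y] - \hat{\mathbb{E}}_{t_{k-1}}[y]}{1 + \lambda / N(t_{k-1})}
\]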
Classes
class C45TreeClassifier (max_rules: int = None)
-
A C4.5 tree classifier.

Parameters
----------
max_rules : int, optional (default=None)
    Maximum number of split nodes allowed in the tree
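A minimal usage sketch, mirroring the module's __main__ block (assumes the pmlb ionosphere dataset is reachable through imodels):

    from imodels.util.data_util import get_clean_dataset

    X, y, feature_names = get_clean_dataset('ionosphere', data_source='pmlb')
    clf = C45TreeClassifier(max_rules=3)
    clf.fit(X, y, feature_names=feature_names)
    probs = clf.predict_proba(X)  # (n_samples, 2) array of [P(y=0), P(y=1)]
    labels = clf.predict(X)       # 0/1 labels, thresholded at 0.5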
Ancestors
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
- sklearn.base.ClassifierMixin
Instance variables
var root
- The root node of the fitted tree (the first child of the parsed XML DOM).
Methods
def fit(self, X, y, feature_names: List[str] = None)
- Fit the tree on X and y, building an XML representation of the splits in tree_ / dom_.
def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names)
-
Recursively grow the tree under parent, choosing splits by information gain
(numeric features) or gain ratio (categorical features).

Parameters
----------
X_t : List[list]
    input data transposed (num_features x num_observations)
y_str : List[str]
    outcome represented as strings
parent
    XML element to which child nodes are attached
attrs_names
    names of the features (one per row of X_t)
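For clarity, grow_tree expects the data transposed relative to fit's X; a small sketch of the layout (toy values, hypothetical feature contents):

    # X has 3 observations and 2 features; fit() hands grow_tree the transpose:
    X_t = [[1.0, 2.0, 3.0],   # feature 0 across all observations
           ['a', 'b', 'a']]   # feature 1 across all observations
    y_str = ['0', '1', '1']   # outcomes as strings; '?' marks missing values in X_t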
def impute_nodes(self, X, y)
-
Route each training sample down the tree, recording its label at every node
it visits, then set each node's value to the mean of the labels collected there.
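A sketch of the intended call order (this is what HSC45TreeClassifier.fit does internally):

    clf = C45TreeClassifier(max_rules=3).fit(X, y)
    clf.impute_nodes(X, y)  # attach labels; node values become label means
    # node values can now be shrunk, e.g. via shrink_node(clf.root, ...)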
def predict(self, X)
- Predict 0/1 class labels by thresholding raw_preds at 0.5.
def predict_proba(self, X)
- Return class probabilities as an (n_samples, 2) array of [P(y=0), P(y=1)].
def raw_preds(self, X)
- Return the raw positive-class probability for each sample.
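The three prediction methods relate as follows (a short numpy sketch of the source above):

    p = clf.raw_preds(X)              # P(y=1) per sample
    proba = np.vstack((1 - p, p)).T   # what predict_proba returns
    labels = (p > 0.5).astype(int)    # what predict returns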
def set_fit_request(self: C45TreeClassifier, *, feature_names: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> C45TreeClassifier
-
Request metadata passed to the fit method (inherited from scikit-learn).

Only relevant if enable_metadata_routing=True (see sklearn.set_config and the
metadata-routing section of the scikit-learn User Guide). The options for each
parameter are:

- True: metadata is requested, and passed to fit if provided. The request is
  ignored if metadata is not provided.
- False: metadata is not requested and the meta-estimator will not pass it to fit.
- None: metadata is not requested, and the meta-estimator will raise an error
  if the user provides it.
- str: metadata should be passed to the meta-estimator with this given alias
  instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing
request, which allows changing the request for some parameters and not others.
Only relevant when this estimator is used as a sub-estimator of a
meta-estimator, e.g. inside a Pipeline; otherwise it has no effect. Added in
scikit-learn 1.3.

Parameters
----------
feature_names : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
    Metadata routing for the feature_names parameter in fit.

Returns
-------
self : object
    The updated object.
-
def set_score_request(self: C45TreeClassifier, *, sample_weight: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> C45TreeClassifier
-
Request metadata passed to the score method (inherited from scikit-learn).

Only relevant if enable_metadata_routing=True (see sklearn.set_config and the
metadata-routing section of the scikit-learn User Guide). The options for each
parameter are:

- True: metadata is requested, and passed to score if provided. The request is
  ignored if metadata is not provided.
- False: metadata is not requested and the meta-estimator will not pass it to score.
- None: metadata is not requested, and the meta-estimator will raise an error
  if the user provides it.
- str: metadata should be passed to the meta-estimator with this given alias
  instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing
request, which allows changing the request for some parameters and not others.
Only relevant when this estimator is used as a sub-estimator of a
meta-estimator, e.g. inside a Pipeline; otherwise it has no effect. Added in
scikit-learn 1.3.

Parameters
----------
sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
    Metadata routing for the sample_weight parameter in score.

Returns
-------
self : object
    The updated object.
-
class HSC45TreeClassifier (estimator_: C45TreeClassifier, reg_param: float = 1, shrinkage_scheme_: str = 'node_based')
-
Params
------
estimator_ : C45TreeClassifier
    the tree to shrink (fitted, or it will be fitted on the first call to fit)
reg_param : float
    Higher means more regularization (can be arbitrarily large; should not be < 0)
shrinkage_scheme_ : str
    Experimental: used to experiment with different forms of shrinkage. Options are:
    (i) node_based shrinks based on the number of samples in the parent node
    (ii) leaf_based only shrinks leaf nodes, based on the number of leaf samples
    (iii) constant shrinks every node by a constant lambda
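A usage sketch mirroring the module's __main__ block (the wrapped tree may be fitted beforehand; otherwise fit trains it first):

    base = C45TreeClassifier(max_rules=3)
    base.fit(X, y)
    hs = HSC45TreeClassifier(estimator_=base, reg_param=1)
    hs.fit(X, y)                 # imputes node values, then shrinks the tree in place
    probs = hs.predict_proba(X)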
Ancestors
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
Subclasses
- HSC45TreeClassifierCV
Instance variables
var complexity_
- Number of split nodes in the wrapped tree (delegates to estimator_.complexity_).
Methods
def fit(self, *args, **kwargs)
- Fit the wrapped tree if it is not already fitted, impute node values from (X, y), then apply shrinkage.
def impute_nodes(self, X, y)
- Delegate to estimator_.impute_nodes.
def predict(self, X)
- Delegate to estimator_.predict.
def predict_proba(self, X)
- Delegate to estimator_.predict_proba.
def shrink_tree(self)
- Shrink the wrapped tree in place using reg_param and shrinkage_scheme_.
class HSC45TreeClassifierCV (estimator_: C45TreeClassifier, reg_param_list: List[float] = [0.1, 1, 10, 50, 100, 500], shrinkage_scheme_: str = 'node_based', cv: int = 3, scoring='accuracy', *args, **kwargs)
-
Cross-validates reg_param over reg_param_list using cv-fold cross_val_score, then refits with the best value. Note: args and kwargs are not used, but are kept so that imodels-experiments can still pass redundant arguments.
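A sketch of typical use (toy data assumed; scoring accepts any scikit-learn scorer name):

    base = C45TreeClassifier(max_rules=3)
    hs_cv = HSC45TreeClassifierCV(estimator_=base, reg_param_list=[0.1, 1, 10, 100],
                                  cv=3, scoring='accuracy')
    hs_cv.fit(X, y)         # selects reg_param by CV accuracy, then shrinks with it
    print(hs_cv.reg_param)  # the chosen regularization strength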
Ancestors
- HSC45TreeClassifier
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
Methods
def fit(self, X, y, *args, **kwargs)
- Select reg_param from reg_param_list by cross-validation, then fit with the best value.