Module imodelsx.augtree.stump
Classes
class Stump (max_features=5,
split_strategy='cart',
tokenizer=None,
refinement_strategy='None',
use_refine_ties: bool = False,
assert_checks: bool = False,
llm_prompt_context: str = '',
embs_manager: EmbsManager = None,
verbose: bool = True,
use_stemming: bool = False,
cache_expansions_dir: str = None)
Expand source code
class Stump():
    def __init__(
        self,
        max_features=5,
        split_strategy='cart',
        tokenizer=None,
        refinement_strategy='None',
        use_refine_ties: bool = False,
        assert_checks: bool = False,
        llm_prompt_context: str = '',
        embs_manager: EmbsManager = None,
        verbose: bool = True,
        use_stemming: bool = False,
        cache_expansions_dir: str = None,
    ):
        """Fit a single stump.
        Currently only supports binary classification with binary features.
        """
        self.max_features = max_features
        self.split_strategy = split_strategy
        self.use_refine_ties = use_refine_ties
        self.child_left = None
        self.child_right = None
        self.assert_checks = assert_checks
        self.llm_prompt_context = llm_prompt_context
        self.refinement_strategy = refinement_strategy
        self.verbose = verbose
        self.embs_manager = embs_manager
        self.use_stemming = use_stemming
        self.cache_expansions_dir = cache_expansions_dir
        if tokenizer is None:
            self.tokenizer = imodelsx.augtree.utils.get_spacy_tokenizer(use_stemming=use_stemming)
        else:
            self.tokenizer = tokenizer
        if self.split_strategy == 'cart':
            self.criterion = 'gini'
        elif self.split_strategy == 'id3':
            self.criterion = 'entropy'
        elif self.split_strategy == 'mse':
            self.criterion = 'mse'

    def fit(self, X, y, feature_names=None, X_text=None):
        # check input and set some attributes
        assert len(np.unique(y)) > 1, 'y should have more than 1 unique value'
        if not isinstance(self, RegressorMixin):
            assert len(np.unique(y)) <= 2, 'only binary classification is supported'
        X, y, _ = imodels.util.arguments.check_fit_arguments(
            self, X, y, feature_names)
        self.feature_names = feature_names
        if isinstance(self.feature_names, list):
            self.feature_names = np.array(self.feature_names).flatten()

        # fit stump
        if self.split_strategy == 'linear':
            if isinstance(self, RegressorMixin):
                raise NotImplementedError('linear split strategy not implemented for regression')
            self.stump_keywords_idxs = self._get_stump_keywords_linear(X, y)
        else:
            self.stump_keywords_idxs = self._get_stump_keywords_cart(X, y)
        self.stump_keywords = self.feature_names[self.stump_keywords_idxs]

        # set value
        self._set_value_acc_samples(X_text, y)
        if self.failed_to_split:
            return self

        # checks
        if self.assert_checks:
            preds_text = self.predict(X_text=X_text, predict_strategy='text')
            preds_tab = self.predict(X=X, predict_strategy='tabular')
            assert np.all(preds_text == preds_tab), \
                'predicting with text and tabular should give same results'
            assert self.value[1] > self.value[0], \
                'right child should have greater val than left but value=' + str(self.value)
            assert self.value[1] > self.value_mean, \
                'right child should have greater val than parent ' + str(self.value)

        # refine llm keywords
        if not self.refinement_strategy == 'None':
            if self.verbose:
                logging.debug(f'\t\tbefore refining acc {self.acc:0.4f}')
            self.stump_keywords_refined = self._refine_keywords(
                self.stump_keywords, X_text, y, tokenizer=self.tokenizer,
            )
            self._set_value_acc_samples(X_text, y)
            if self.verbose:
                logging.debug(f'\t\trefined acc {self.acc:.4f} {self.stump_keywords_refined[0]} -> {self.stump_keywords_refined[:5]}...')

        return self

    def predict(self, X=None, X_text: List[str] = None,
                predict_strategy='text', keywords=None) -> np.ndarray[int]:
        """Returns prediction 1 for positive and 0 for negative.
        """
        assert not (predict_strategy == 'tabular' and X is None)
        assert not (predict_strategy == 'text' and X_text is None)
        if predict_strategy == 'tabular':
            X = imodels.util.arguments.check_fit_X(X)
            # predict whether input has any of the features in stump_keywords_idxs
            X_feats = X[:, self.stump_keywords_idxs]
            pred = np.any(X_feats, axis=1)
            if self.pos_or_neg == 'pos':
                return pred.astype(int)
            else:
                return 1 - pred
        elif predict_strategy == 'text':
            if not keywords:
                if hasattr(self, 'stump_keywords_refined'):
                    keywords = self.stump_keywords_refined
                else:
                    keywords = self.stump_keywords
            ngrams_used_to_predict = max(
                [len(keyword.split(' ')) for keyword in keywords])

            def contains_any_of_keywords(text):
                text = text.lower()
                text = imodelsx.util.generate_ngrams_list(
                    text,
                    ngrams=ngrams_used_to_predict,
                    tokenizer_ngrams=self.tokenizer,
                    all_ngrams=True
                )
                for keyword in keywords:
                    if keyword in text:
                        return 1
                return 0

            contains_keywords = 1 * np.array(
                [contains_any_of_keywords(x) for x in X_text])
            if self.pos_or_neg == 'pos':
                return contains_keywords
            else:
                return 1 - contains_keywords

    def predict_regression(self, X_text, **kwargs):
        preds_binary = self.predict(X_text=X_text, **kwargs)
        return preds_binary * self.value[1] + (1 - preds_binary) * self.value[0]

    def _get_stump_keywords_cart(self, X, y):
        '''iteratively select the feature selected by DecisionTreeClassifier
        removes that feature, and repeats
        '''
        if self.criterion == 'gini':
            criterion_func = imodelsx.augtree.utils.impurity_gini
        elif self.criterion == 'entropy':
            criterion_func = imodelsx.augtree.utils.impurity_entropy
        elif self.criterion == 'mse':
            criterion_func = imodelsx.augtree.utils.impurity_mse

        # Calculate the gini impurity reduction for each (binary) feature in X
        impurity_reductions = []

        # whether the feature increases the likelihood of the positive class
        feature_positive = []
        n = y.size
        gini_impurity = criterion_func(y)
        for i in range(X.shape[1]):
            x = X[:, i]
            idxs_r = x > 0.5
            idxs_l = x <= 0.5
            if idxs_r.sum() == 0 or idxs_l.sum() == 0:
                impurity_reductions.append(0)
                feature_positive.append(True)
            else:
                gini_impurity_l = criterion_func(y[idxs_l])
                gini_impurity_r = criterion_func(y[idxs_r])
                impurity_reductions.append(
                    gini_impurity
                    - (idxs_l.sum() / n) * gini_impurity_l
                    - (idxs_r.sum() / n) * gini_impurity_r
                )
                feature_positive.append(np.mean(y[idxs_r]) > np.mean(y[idxs_l]))

        impurity_reductions = np.array(impurity_reductions)
        feature_positive = np.arange(X.shape[1])[np.array(feature_positive)]

        # find the top self.max_features with the largest impurity reductions
        args_largest_reduction_first = np.argsort(impurity_reductions)[::-1]
        self.impurity_reductions = impurity_reductions[args_largest_reduction_first][:self.max_features]
        imp_pos_top = [
            k for k in args_largest_reduction_first
            if k in feature_positive
            and not k in STOPWORDS
        ][:self.max_features]
        imp_neg_top = [
            k for k in args_largest_reduction_first
            if not k in feature_positive
            and not k in STOPWORDS
        ][:self.max_features]
        if np.sum(imp_pos_top) > np.sum(imp_neg_top):
            self.pos_or_neg = 'pos'
            return imp_pos_top
        else:
            self.pos_or_neg = 'neg'
            return imp_neg_top

    def _refine_keywords(
        self,
        keywords: List[str],
        X_text: List[str],
        y,
        tokenizer,
        max_words_in_single_keyword=4,
    ) -> List[str]:
        """Refine each keyword using LLM.
        Greedily add extra keywords based on whether they improve acc (should change to impurity).
        Return list corresponding to the candidates from the single best keyword
        """
        if isinstance(self, RegressorMixin):
            criterion_func = imodelsx.augtree.utils.mse_score
            predict_func = self.predict_regression
        else:
            criterion_func = imodelsx.augtree.utils.gini_score
            predict_func = self.predict
        criterions_keyword = []
        candidates_list_keyword = []
        for i in range(len(keywords)):
            # get keyword
            keyword = keywords[i]

            # get refined_keywords
            if self.refinement_strategy == 'llm':
                keywords_refined = imodelsx.augtree.llm.expand_keyword(
                    keyword, self.llm_prompt_context, cache_dir=self.cache_expansions_dir)
            elif self.refinement_strategy == 'embs':
                keywords_refined = self.embs_manager.expand_keyword(keyword)

            # filter out keywords that are too long
            keywords_refined = [
                k for k in keywords_refined
                if len(k.split()) <= max_words_in_single_keyword
            ]

            if self.use_stemming:
                # apply tokenizer to each unigram and combine
                keywords_refined = [
                    ' '.join([str(tok) for tok in tokenizer(str(word))])
                    for word in keywords_refined
                ]
                # drop things that end up too short
                keywords_refined = [
                    k for k in keywords_refined
                    if len(k) > 2
                ]

            # greedily grow words one at a time, testing acc with each new word
            words = [keyword]
            criterion_max = criterion_func(y, predict_func(X_text=X_text, keywords=words))
            for keyword_refined in keywords_refined:
                preds_check = predict_func(
                    X_text=X_text, keywords=words + [keyword_refined])

                # if acc improved, add the refined_keyword
                crit = criterion_func(y, preds_check)
                if self.use_refine_ties:
                    check_crit = crit >= criterion_max
                else:
                    check_crit = crit > criterion_max
                if check_crit:
                    words.append(keyword_refined)
                    criterion_max = crit
                    logging.debug(f'\t\t\tadded {repr(keyword_refined)}')

            # append the results
            criterions_keyword.append(criterion_max)
            candidates_list_keyword.append(words)

        idx_best = np.argmax(criterions_keyword)
        keywords_best = [
            ' '.join([str(tok) for tok in tokenizer(str(word))])  # clean up word
            for word in candidates_list_keyword[idx_best]
        ]
        return keywords_best

    def _set_value_acc_samples(self, X_text, y):
        """Set value and accuracy of stump.
        """
        idxs_right = self.predict(X_text=X_text).astype(bool)
        n_right = idxs_right.sum()
        if n_right == 0 or n_right == y.size:
            self.failed_to_split = True
            return
        else:
            self.failed_to_split = False
        self.n_samples = [y.size - n_right, n_right]
        self.value = [np.mean(y[~idxs_right]), np.mean(y[idxs_right])]
        self.value_mean = np.mean(y)
        if isinstance(self, RegressorMixin):
            preds = self.value[1] * idxs_right + self.value[0] * ~idxs_right
            self.acc = imodelsx.augtree.utils.mse_score(y, preds)
        else:
            preds = 1 * idxs_right
            self.acc = accuracy_score(y, preds)

    def __str__(self):
        if hasattr(self, 'stump_keywords_refined'):
            keywords = self.stump_keywords_refined
        else:
            keywords = self.stump_keywords
        keywords_str = ", ".join(keywords[:5])
        if len(keywords) > 5:
            keywords_str += f'...({len(keywords) - 5} more)'
        sign = {'pos': '+', 'neg': '--'}[self.pos_or_neg]
        return f'Stump(val={self.value_mean:0.2f} n={self.n_samples}) {sign} {keywords_str}'

    def get_str_simple(self):
        if hasattr(self, 'stump_keywords_refined'):
            keywords = self.stump_keywords_refined
        else:
            keywords = self.stump_keywords
        keywords_str = ", ".join(keywords[:5])
        if len(keywords) > 5:
            keywords_str += f'...({len(keywords) - 5} more)'
        sign = {'pos': '+', 'neg': '--'}[self.pos_or_neg]
        return f'{sign} {keywords_str}'

    def _get_stump_keywords_linear(self, X, y):
        # fit a linear model
        m = LogisticRegression().fit(X, y)
        m.fit(X, y)

        # find the largest magnitude coefs
        abs_feature_idxs = m.coef_.argsort().flatten()
        bot_feature_idxs = abs_feature_idxs[:self.max_features]
        top_feature_idxs = abs_feature_idxs[-self.max_features:][::-1]

        # return the features with the largest magnitude coefs
        if np.sum(abs(bot_feature_idxs)) > np.sum(abs(top_feature_idxs)):
            self.pos_or_neg = 'neg'
            return bot_feature_idxs
        else:
            self.pos_or_neg = 'pos'
            return top_feature_idxs
Fit a single stump. Currently only supports binary classification with binary features.
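A minimal usage sketch (not part of the module itself): the texts, labels, and vocabulary below are invented for illustration. X is the binary bag-of-words matrix whose columns align with feature_names, X_text holds the raw strings used for text-based prediction, and the default spaCy tokenizer from imodelsx is assumed to be installed.

import numpy as np
from imodelsx.augtree.stump import StumpClassifier

X_text = [
    'the movie was great',
    'great acting and a great script',
    'terrible plot',
    'the film was boring',
]
feature_names = ['great', 'terrible', 'boring']
# X[i, j] = 1 iff feature_names[j] occurs in X_text[i]
X = np.array([
    [1, 0, 0],
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
])
y = np.array([1, 1, 0, 0])

stump = StumpClassifier(max_features=2).fit(
    X, y, feature_names=feature_names, X_text=X_text)
print(stump)  # summarizes value, sample counts, sign, and keywords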
Subclasses
- StumpClassifier
- StumpRegressor
Methods
def fit(self, X, y, feature_names=None, X_text=None)
Expand source code
def fit(self, X, y, feature_names=None, X_text=None):
    # check input and set some attributes
    assert len(np.unique(y)) > 1, 'y should have more than 1 unique value'
    if not isinstance(self, RegressorMixin):
        assert len(np.unique(y)) <= 2, 'only binary classification is supported'
    X, y, _ = imodels.util.arguments.check_fit_arguments(
        self, X, y, feature_names)
    self.feature_names = feature_names
    if isinstance(self.feature_names, list):
        self.feature_names = np.array(self.feature_names).flatten()

    # fit stump
    if self.split_strategy == 'linear':
        if isinstance(self, RegressorMixin):
            raise NotImplementedError('linear split strategy not implemented for regression')
        self.stump_keywords_idxs = self._get_stump_keywords_linear(X, y)
    else:
        self.stump_keywords_idxs = self._get_stump_keywords_cart(X, y)
    self.stump_keywords = self.feature_names[self.stump_keywords_idxs]

    # set value
    self._set_value_acc_samples(X_text, y)
    if self.failed_to_split:
        return self

    # checks
    if self.assert_checks:
        preds_text = self.predict(X_text=X_text, predict_strategy='text')
        preds_tab = self.predict(X=X, predict_strategy='tabular')
        assert np.all(preds_text == preds_tab), \
            'predicting with text and tabular should give same results'
        assert self.value[1] > self.value[0], \
            'right child should have greater val than left but value=' + str(self.value)
        assert self.value[1] > self.value_mean, \
            'right child should have greater val than parent ' + str(self.value)

    # refine llm keywords
    if not self.refinement_strategy == 'None':
        if self.verbose:
            logging.debug(f'\t\tbefore refining acc {self.acc:0.4f}')
        self.stump_keywords_refined = self._refine_keywords(
            self.stump_keywords, X_text, y, tokenizer=self.tokenizer,
        )
        self._set_value_acc_samples(X_text, y)
        if self.verbose:
            logging.debug(f'\t\trefined acc {self.acc:.4f} {self.stump_keywords_refined[0]} -> {self.stump_keywords_refined[:5]}...')

    return self
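After a successful fit, the stump exposes its selected split. Continuing the sketch above (attribute names come from the source; all are set only when failed_to_split is False):

stump.stump_keywords   # keywords chosen by the split strategy
stump.value            # [mean(y) left of split, mean(y) right of split]
stump.n_samples        # [n_left, n_right]
stump.acc              # training accuracy (MSE for regressors)
stump.failed_to_split  # True if one side of the split was empty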
def get_str_simple(self)
Expand source code
def get_str_simple(self):
    if hasattr(self, 'stump_keywords_refined'):
        keywords = self.stump_keywords_refined
    else:
        keywords = self.stump_keywords
    keywords_str = ", ".join(keywords[:5])
    if len(keywords) > 5:
        keywords_str += f'...({len(keywords) - 5} more)'
    sign = {'pos': '+', 'neg': '--'}[self.pos_or_neg]
    return f'{sign} {keywords_str}'
def predict(self, X=None, X_text: List[str] = None, predict_strategy='text', keywords=None) -> numpy.ndarray[int]
Expand source code
def predict(self, X=None, X_text: List[str] = None,
            predict_strategy='text', keywords=None) -> np.ndarray[int]:
    """Returns prediction 1 for positive and 0 for negative.
    """
    assert not (predict_strategy == 'tabular' and X is None)
    assert not (predict_strategy == 'text' and X_text is None)
    if predict_strategy == 'tabular':
        X = imodels.util.arguments.check_fit_X(X)
        # predict whether input has any of the features in stump_keywords_idxs
        X_feats = X[:, self.stump_keywords_idxs]
        pred = np.any(X_feats, axis=1)
        if self.pos_or_neg == 'pos':
            return pred.astype(int)
        else:
            return 1 - pred
    elif predict_strategy == 'text':
        if not keywords:
            if hasattr(self, 'stump_keywords_refined'):
                keywords = self.stump_keywords_refined
            else:
                keywords = self.stump_keywords
        ngrams_used_to_predict = max(
            [len(keyword.split(' ')) for keyword in keywords])

        def contains_any_of_keywords(text):
            text = text.lower()
            text = imodelsx.util.generate_ngrams_list(
                text,
                ngrams=ngrams_used_to_predict,
                tokenizer_ngrams=self.tokenizer,
                all_ngrams=True
            )
            for keyword in keywords:
                if keyword in text:
                    return 1
            return 0

        contains_keywords = 1 * np.array(
            [contains_any_of_keywords(x) for x in X_text])
        if self.pos_or_neg == 'pos':
            return contains_keywords
        else:
            return 1 - contains_keywords
Returns prediction 1 for positive and 0 for negative.
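Continuing the sketch above: the two prediction strategies should agree when X's columns mirror the fitted keywords, and an explicit keywords list (invented here) overrides the fitted ones on the text path only.

preds_text = stump.predict(X_text=X_text, predict_strategy='text')
preds_tab = stump.predict(X=X, predict_strategy='tabular')
assert np.all(preds_text == preds_tab)

# text-only: bypass the fitted keywords with a custom list
preds_custom = stump.predict(X_text=X_text, keywords=['great', 'script'])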
def predict_regression(self, X_text, **kwargs)
Expand source code
def predict_regression(self, X_text, **kwargs):
    preds_binary = self.predict(X_text=X_text, **kwargs)
    return preds_binary * self.value[1] + (1 - preds_binary) * self.value[0]
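This is a two-valued lookup: each binary prediction b becomes b * value[1] + (1 - b) * value[0]. A worked example with invented leaf means:

import numpy as np

value = [0.2, 0.8]                  # invented leaf means
preds_binary = np.array([1, 0, 1])  # invented keyword matches
preds = preds_binary * value[1] + (1 - preds_binary) * value[0]
# preds == array([0.8, 0.2, 0.8])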
class StumpClassifier (max_features=5,
split_strategy='cart',
tokenizer=None,
refinement_strategy='None',
use_refine_ties: bool = False,
assert_checks: bool = False,
llm_prompt_context: str = '',
embs_manager: EmbsManager = None,
verbose: bool = True,
use_stemming: bool = False,
cache_expansions_dir: str = None)
Expand source code
class StumpClassifier(Stump, ClassifierMixin):
    ...
Mixin class for all classifiers in scikit-learn.
This mixin defines the following functionality:
- set estimator type to "classifier" through the estimator_type tag;
- score method that defaults to sklearn.metrics.accuracy_score;
- enforce that fit requires y to be passed through the requires_y tag, which is done by setting the classifier type tag.
Read more in the scikit-learn User Guide (rolling_your_own_estimator).
Examples
>>> import numpy as np
>>> from sklearn.base import BaseEstimator, ClassifierMixin
>>> # Mixin classes should always be on the left-hand side for a correct MRO
>>> class MyEstimator(ClassifierMixin, BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=1)
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([1, 1, 1])
>>> estimator.score(X, y)
0.66...
Fit a single stump. Currently only supports binary classification with binary features.
Ancestors
- Stump
- sklearn.base.ClassifierMixin
Inherited members
- Stump:
  - fit
  - get_str_simple
  - predict
  - predict_regression
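A usage sketch reusing the invented data from the Stump example above. Note that ClassifierMixin's score calls predict(X), while this class defaults to predict_strategy='text', so accuracy is computed explicitly here:

from sklearn.metrics import accuracy_score

clf = StumpClassifier().fit(X, y, feature_names=feature_names, X_text=X_text)
acc = accuracy_score(y, clf.predict(X_text=X_text))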
class StumpRegressor (max_features=5,
split_strategy='cart',
tokenizer=None,
refinement_strategy='None',
use_refine_ties: bool = False,
assert_checks: bool = False,
llm_prompt_context: str = '',
embs_manager: EmbsManager = None,
verbose: bool = True,
use_stemming: bool = False,
cache_expansions_dir: str = None)
Expand source code
class StumpRegressor(Stump, RegressorMixin):
    ...
Mixin class for all regression estimators in scikit-learn.
This mixin defines the following functionality:
- set estimator type to "regressor" through the estimator_type tag;
- score method that defaults to sklearn.metrics.r2_score;
- enforce that fit requires y to be passed through the requires_y tag, which is done by setting the regressor type tag.
Read more in the scikit-learn User Guide (rolling_your_own_estimator).
Examples
>>> import numpy as np
>>> from sklearn.base import BaseEstimator, RegressorMixin
>>> # Mixin classes should always be on the left-hand side for a correct MRO
>>> class MyEstimator(RegressorMixin, BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=0)
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([-1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([0, 0, 0])
>>> estimator.score(X, y)
0.0
Fit a single stump. Currently only supports binary classification with binary features.
Ancestors
- Stump
- sklearn.base.RegressorMixin
Inherited members
- Stump:
  - fit
  - get_str_simple
  - predict
  - predict_regression
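A sketch of the regression variant, reusing X, feature_names, and X_text from the Stump example above with invented continuous targets. split_strategy='mse' selects the MSE criterion; note the source raises NotImplementedError for split_strategy='linear' with regressors, and predict_regression returns the leaf means rather than 0/1:

import numpy as np
from imodelsx.augtree.stump import StumpRegressor

y_cont = np.array([0.9, 0.8, 0.1, 0.2])  # invented continuous targets
reg = StumpRegressor(split_strategy='mse').fit(
    X, y_cont, feature_names=feature_names, X_text=X_text)
preds = reg.predict_regression(X_text=X_text)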