Expand source code
``````from typing import List
from collections import Counter

from imodels.util.rule import Rule

def prune_mins(rules: List[Rule], precision_min: float, recall_min: float) -> List[Rule]:
    """Drop rules below the precision/recall floors and merge duplicates.

    Each element of ``rules`` is unpacked as a ``(rule, score)`` pair;
    ``score[0]`` is compared against ``precision_min`` and ``score[1]``
    against ``recall_min``. A rule that occurs several times is kept once,
    with both scores replaced by their mean over every occurrence that
    passed the filter.

    Parameters
    ----------
    rules : List of Rule
        Candidate rules; each must iterate as ``(rule, score)``.
    precision_min : float
        Lower bound (inclusive) on ``score[0]``.
    recall_min : float
        Lower bound (inclusive) on ``score[1]``.

    Returns
    -------
    List of Rule
        One ``Rule`` per distinct surviving rule, constructed with
        ``args=(mean_precision, mean_recall, n_occurrences)`` and ordered by
        precision, then recall, both descending.
    """
    # Factorize rules before semantic tree filtering
    rules_ = [tuple(rule) for rule in rules]
    # rule -> (mean precision, mean recall, number of occurrences)
    rules_dict = {}

    # keep only rules verifying precision_min and recall_min:
    for rule, score in rules_:
        precision, recall = score[0], score[1]
        if precision >= precision_min and recall >= recall_min:
            if rule in rules_dict:
                mean_precision, mean_recall, count = rules_dict[rule]
                count += 1
                # update the score to the new mean:
                # m_new = m_old + (x - m_old) / n
                mean_precision += 1. / count * (precision - mean_precision)
                mean_recall += 1. / count * (recall - mean_recall)
                rules_dict[rule] = (mean_precision, mean_recall, count)
            else:
                rules_dict[rule] = (precision, recall, 1)

    # item is (rule, (precision, recall, count)); best scores first
    rule_tuple_list = sorted(
        rules_dict.items(),
        key=lambda item: (item[1][0], item[1][1]),
        reverse=True,
    )
    return [Rule(rule, args=scores) for rule, scores in rule_tuple_list]

def deduplicate(rules: List[Rule], max_depth_dup: int) -> List[Rule]:
    """Keep the best rule of each group of similar rules, best F1 first.

    When ``max_depth_dup`` is not ``None``, the rules are grouped with
    ``find_similar_rulesets(rules, max_depth_dup)`` and only the rule with
    the highest ``f1_score`` of each group is kept. When it is ``None`` the
    rules are left as they are. In both cases the result is sorted by
    decreasing ``f1_score``.
    """

    def negated_f1(rule):
        # Sorting on the negated score yields a descending order.
        return - f1_score(rule)

    if max_depth_dup is not None:
        groups = find_similar_rulesets(rules, max_depth_dup)
        rules = [max(group, key=f1_score) for group in groups]
    return sorted(rules, key=negated_f1)

def f1_score(rule: Rule) -> float:
    """Return the F1 score of a rule.

    The score is the harmonic mean ``2 * p * r / (p + r)`` of
    ``p = rule.args[0]`` and ``r = rule.args[1]`` (the precision and recall
    stored by ``prune_mins``). It is ``0.0`` when ``p + r`` is not positive,
    which avoids a division by zero.
    """
    precision, recall = rule.args[0], rule.args[1]
    total = precision + recall
    return 2 * precision * recall / total if total > 0 else 0.0

def find_similar_rulesets(rules: List[Rule], max_depth_duplication: int = None) -> List[List[Rule]]:
"""Create clusters of rules using a decision tree based
on the terms of the rules

Parameters
----------
rules : List, List of rules
The rules that should be splitted in subsets of similar rules

Returns
-------
rules : List of list of rules
The different set of rules. Each set should be homogeneous

"""

def split_with_best_feature(rules, depth, exceptions=[]):
"""
Method to find a split of rules given most represented feature
"""
if depth == 0:
return rules

rulelist = [rule.split(' and ') for rule, score in rules]
terms = [t.split(' ') for term in rulelist for t in term]
counter = Counter(terms)
# Drop exception list
for exception in exceptions:
del counter[exception]

if len(counter) == 0:
return rules

most_represented_term = counter.most_common()

# Proceed to split
rules_splitted = [[], [], []]
for rule in rules:
if (most_represented_term + ' <=') in rule.rule:
rules_splitted.append(rule)
elif (most_represented_term + ' >') in rule.rule:
rules_splitted.append(rule)
else:
rules_splitted.append(rule)
new_exceptions = exceptions + [most_represented_term]

# Choose best term
return [split_with_best_feature(ruleset,
depth - 1,
exceptions=new_exceptions)
for ruleset in rules_splitted]

if len(rules) == 0 or not isinstance(rules, list):
if len(rules) > 0:
return leaves.append(rules)
else:
for rules_child in rules:
return leaves

leaves = []
res = split_with_best_feature(rules, max_depth_duplication)
return leaves``````

## Functions

``` def deduplicate(rules: List[Rule], max_depth_dup: int) ‑> List[Rule] ```
Expand source code
``````def deduplicate(rules: List[Rule], max_depth_dup: int) -> List[Rule]:
if max_depth_dup is not None:
rules = [max(rules_set, key=f1_score) for rules_set in find_similar_rulesets(rules, max_depth_dup)]
return sorted(rules, key=lambda x: - f1_score(x))``````
``` def f1_score(rule: Rule) ‑> float ```
Expand source code
``````def f1_score(rule: Rule) -> float:
return 2 * rule.args * rule.args / \
(rule.args + rule.args) if (rule.args + rule.args) > 0 else 0``````
``` def find_similar_rulesets(rules: List[Rule], max_depth_duplication: int = None) ‑> List[List[Rule]] ```

Create clusters of rules using a decision tree based on the terms of the rules

## Parameters

`rules` : `List` of `rules`
The rules that should be split into subsets of similar rules

## Returns

`rules` : `List` of `list` of `rules`
The different sets of rules. Each set should be homogeneous.
Expand source code
``````def find_similar_rulesets(rules: List[Rule], max_depth_duplication: int = None) -> List[List[Rule]]:
"""Create clusters of rules using a decision tree based
on the terms of the rules

Parameters
----------
rules : List, List of rules
The rules that should be splitted in subsets of similar rules

Returns
-------
rules : List of list of rules
The different set of rules. Each set should be homogeneous

"""

def split_with_best_feature(rules, depth, exceptions=[]):
"""
Method to find a split of rules given most represented feature
"""
if depth == 0:
return rules

rulelist = [rule.split(' and ') for rule, score in rules]
terms = [t.split(' ') for term in rulelist for t in term]
counter = Counter(terms)
# Drop exception list
for exception in exceptions:
del counter[exception]

if len(counter) == 0:
return rules

most_represented_term = counter.most_common()

# Proceed to split
rules_splitted = [[], [], []]
for rule in rules:
if (most_represented_term + ' <=') in rule.rule:
rules_splitted.append(rule)
elif (most_represented_term + ' >') in rule.rule:
rules_splitted.append(rule)
else:
rules_splitted.append(rule)
new_exceptions = exceptions + [most_represented_term]

# Choose best term
return [split_with_best_feature(ruleset,
depth - 1,
exceptions=new_exceptions)
for ruleset in rules_splitted]

if len(rules) == 0 or not isinstance(rules, list):
if len(rules) > 0:
return leaves.append(rules)
else:
for rules_child in rules:
return leaves

leaves = []
res = split_with_best_feature(rules, max_depth_duplication)
return leaves``````
``` def prune_mins(rules: List[Rule], precision_min: float, recall_min: float) ‑> List[Rule] ```
Expand source code
``````def prune_mins(rules: List[Rule], precision_min: float, recall_min: float) -> List[Rule]:
# Factorize rules before semantic tree filtering
rules_ = [tuple(rule) for rule in rules]
rules_dict = {}

# keep only rules verifying precision_min and recall_min:
for rule, score in rules_:
if score >= precision_min and score >= recall_min:
if rule in rules_dict:
# update the score to the new mean
c = rules_dict[rule] + 1
b = rules_dict[rule] + 1. / c * (
score - rules_dict[rule])
a = rules_dict[rule] + 1. / c * (
score - rules_dict[rule])

rules_dict[rule] = (a, b, c)
else:
rules_dict[rule] = (score, score, 1)

rule_tuple_list = sorted(rules_dict.items(), key=lambda x: (x, x), reverse=True)
return [Rule(rule, args=scores) for rule, scores in rule_tuple_list]``````