#######Supplement for "Interpretable classifiers using rules and Bayesian analysis: Building a better stroke prediction model."

###LICENSE
#
# This software is released under the MIT license.
#
# Copyright (c) 2013-14 Ben Letham
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# The author/copyright holder can be contacted at bletham@mit.edu

####README
# This code implements the Bayesian Rule Lists (BRL) algorithm from the paper
# above. It is specific to binary classification with binary features (although
# it could easily be extended to multiclass).
#
#
# ##OUTPUT
#
# The highest level function, "topscript," returns:
#
# - permsdic - Contains the important information from the MCMC sampling. A
# dictionary whose keys are a Pickle dump (bytes) of the antecedent list d, and
# whose values are a list [a,b] where a is (proportional to) the log posterior of
# d, and b is the number of times d appears in the MCMC samples.
# - d_star - the BRL-point antecedent list. A list of indices corresponding to
# variable "itemsets."
# - itemsets - A list of itemsets. itemsets[d_star[i]] is the antecedent in
# position i on the BRL-point list
# - theta - A list of the expected value of the posterior consequent
# distribution for each entry in BRL-point.
# - ci_theta - A list of tuples, each the 95% credible interval for the
# corresponding theta.
# - preds_d_star - Predictions on the demo data made using d_star and theta.
# - accur_d_star - The accuracy of the BRL-point predictions, with the decision
# boundary at 0.5.
# - preds_fullpost - Predictions on the demo data using the full posterior
# (BRL-post)
# - accur_fullpost - The accuracy of the BRL-post predictions, decision boundary
# at 0.5.
#
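# ##EXAMPLE USAGE
#
# An illustrative sketch only (not part of the original supplement): "topscript"
# and the demo data live elsewhere in the supplement, and the return order below
# is assumed to follow the list above.
#
# (permsdic, d_star, itemsets, theta, ci_theta, preds_d_star, accur_d_star,
#  preds_fullpost, accur_fullpost) = topscript()
#
# # Print the BRL-point list: antecedent, posterior P(Y=1), and 95% credible interval
# for i, j in enumerate(d_star):
#     print(itemsets[j], 'P(Y=1) =', round(theta[i], 3), '95% CI:', ci_theta[i])
# print('BRL-point accuracy (threshold 0.5):', accur_d_star)
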
import pickle as Pickle
import time
from collections import defaultdict

from numpy import *
from scipy.special import gammaln
from scipy.stats import poisson, beta

try:
    from matplotlib import pyplot as plt
except ImportError:
    pass  # plotting (plot_chains) is optional


###############BRL


def default_permsdic():
    '''For producing the defaultdict used for storing MCMC results
    '''
    return [0., 0.]


def reset_permsdic(permsdic):
    '''Resets the number of MCMC samples stored (value[1]) while maintaining the
    log-posterior value (so it doesn't need to be re-computed in future chains).
    '''
    for perm in permsdic:
        permsdic[perm][1] = 0.
    return permsdic


def run_bdl_multichain_serial(numiters, thinning, alpha, lbda, eta, X, Y, nruleslen, lhs_len, maxlhs, permsdic, burnin,
                              nchains, d_inits, verbose=True, seed=42):
    '''Run mcmc for each of the chains in serial
    '''
    # random seed
    random.seed(seed)

    # Run each chain
    t1 = time.process_time()
    if verbose:
        print('Starting mcmc chains')
    res = {}
    for n in range(nchains):
        res[n] = mcmcchain(numiters, thinning, alpha, lbda, eta, X, Y, nruleslen, lhs_len, maxlhs, permsdic, burnin,
                           nchains, d_inits[n])

    if verbose:
        print('Elapsed CPU time', time.process_time() - t1)

    # Check convergence
    Rhat = gelmanrubin(res)

    if verbose:
        print('Rhat for convergence:', Rhat)
    ##plot?
    # plot_chains(res)
    return res, Rhat


def mcmcchain(numiters, thinning, alpha, lbda, eta, X, Y, nruleslen, lhs_len, maxlhs, permsdic, burnin, nchains,
              d_init):
    '''Run and store mcmc chain
    '''
    res = {}
    permsdic, res['perms'] = bayesdl_mcmc(numiters, thinning, alpha, lbda, eta, X, Y, nruleslen, lhs_len, maxlhs,
                                          permsdic, burnin, None, d_init)
    # Store the permsdic results
    res['permsdic'] = {perm: list(vals) for perm, vals in permsdic.items() if vals[1] > 0}
    # Reset the permsdic
    permsdic = reset_permsdic(permsdic)
    return res


def gelmanrubin(res):
    '''Check convergence with the Gelman-Rubin diagnostic (Rhat values near 1 indicate the chains have mixed).
    '''
    n = 0  # number of samples per chain - to be computed
    m = len(res)  # number of chains
    phi_bar_j = {}
    for chain in res:
        phi_bar_j[chain] = 0.
        for val in res[chain]['permsdic'].values():
            phi_bar_j[chain] += val[1] * val[0]  # numsamples*log posterior
            n += val[1]
    # And normalize
    n = n // m  # Number of samples per chain (assuming all m chains have same number of samples)
    # Normalize, and compute phi_bar
    phi_bar = 0.
    for chain in phi_bar_j:
        phi_bar_j[chain] = phi_bar_j[chain] / float(n)  # normalize
        phi_bar += phi_bar_j[chain]
    phi_bar = phi_bar / float(m)  # phi_bar = average of phi_bar_j
    # Now B
    B = 0.
    for chain in phi_bar_j:
        B += (phi_bar_j[chain] - phi_bar) ** 2
    B = B * (n / float(m - 1))
    # Now W.
    W = 0.
    for chain in res:
        s2_j = 0.
        for val in res[chain]['permsdic'].values():
            s2_j += val[1] * (val[0] - phi_bar_j[chain]) ** 2
        s2_j = (1. / float(n - 1)) * s2_j
        W += s2_j
    W = W * (1. / float(m))
    # Next varhat
    varhat = ((n - 1) / float(n)) * W + (1. / float(n)) * B
    # And finally,
    if W > 0:
        Rhat = sqrt(varhat / float(W))
    else:
        print('Could not compute Rhat, W=' + str(W) + ', B=' + str(B))
        Rhat = 0.
    return Rhat


def plot_chains(res):
    '''Plot the logposterior values for the samples in the chains.
    '''
    for chain in res:
        plt.plot([res[chain]['permsdic'][a][0] for a in res[chain]['perms']])
    plt.show()
    return


def merge_chains(res):
    '''Merge chains into a single collection of posterior samples
    '''
    permsdic = defaultdict(default_permsdic)
    for n in res:
        for perm, vals in res[n]['permsdic'].items():
            permsdic[perm][0] = vals[0]
            permsdic[perm][1] += vals[1]
    return permsdic


def get_point_estimate(permsdic, lhs_len, X, Y, alpha, nruleslen, maxlhs, lbda, eta, verbose=True):
    '''Get a point estimate with length and width similar to the posterior average, with highest likelihood
    '''
    # Figure out the posterior expected list length and average rule size
    listlens = []
    rulesizes = []
    for perm in permsdic:
        d_t = Pickle.loads(perm)  # recover the antecedent list stored as the dict key

        listlens.extend([len(d_t)] * int(permsdic[perm][1]))
        rulesizes.extend([lhs_len[j] for j in d_t[:-1]] * int(permsdic[perm][1]))

    # Now compute average
    avglistlen = average(listlens)
    if verbose:
        print('Posterior average length:', avglistlen)
    try:
        avgrulesize = average(rulesizes)
        if verbose:
            print('Posterior average width:', avgrulesize)
        # Prepare the intervals
        minlen = int(floor(avglistlen))
        maxlen = int(ceil(avglistlen))
        minrulesize = int(floor(avgrulesize))
        maxrulesize = int(ceil(avgrulesize))
        # Run through all perms again
        likelihoods = []
        d_ts = []
        beta_Z, logalpha_pmf, logbeta_pmf = prior_calculations(lbda, len(X), eta,
                                                               maxlhs)  # get the constants needed to compute the prior
        for perm in permsdic:
            if permsdic[perm][1] > 0:
                d_t = Pickle.loads(perm)  # this is the antecedent list

                # Check the list length
                if len(d_t) >= minlen and len(d_t) <= maxlen:

                    # Check the rule size
                    rulesize = average([lhs_len[j] for j in d_t[:-1]])
                    if rulesize >= minrulesize and rulesize <= maxrulesize:
                        d_ts.append(d_t)

                        # Compute the likelihood
                        R_t = d_t.index(0)
                        N_t = compute_rule_usage(d_t, R_t, X, Y)
                        likelihoods.append(
                            fn_logposterior(d_t, R_t, N_t, alpha, logalpha_pmf, logbeta_pmf, maxlhs, beta_Z, nruleslen,
                                            lhs_len))
        likelihoods = array(likelihoods)
        d_star = d_ts[likelihoods.argmax()]
    except (RuntimeWarning, ValueError):
        # This can happen if all perms are identically [0], or if no solution is found within the length and width bounds (probably the chains didn't converge)
        print('No suitable point estimate found')
        d_star = None
    return d_star


#################COMPUTING RESULTS
def get_rule_rhs(Xtrain, Ytrain, d_t, alpha, intervals):
    '''Compute the posterior consequent distributions
    (i.e., for each rule in the list, the posterior P(Y=1) and its 95% credible interval,
    based on the points that rule captures)
    '''
    N_t = compute_rule_usage(d_t, d_t.index(0), Xtrain, Ytrain)
    theta = []  # P(Y=1)
    ci_theta = []  # confidence interval for Y=1
    for i, j in enumerate(d_t):
        # theta ~ Dirichlet(n_rules[j,:] + alpha)
        # E[theta] = (n_rules[j,:] + alpha)/float(sum(n_rules[j,:] + alpha))
        # NOTE this result is only for binary classification
        # theta = p(y=1)
        theta.append((N_t[i, 1] + alpha[1]) / float(sum(N_t[i, :] + alpha)))
        # And now the 95% interval, for Beta(n_rules[j,1] + alpha[1], n_rules[j,0] + alpha[0])
        if intervals:
            ci_theta.append(beta.interval(0.95, N_t[i, 1] + alpha[1], N_t[i, 0] + alpha[0]))
    return theta, ci_theta


def preds_d_t(X, Y, d_t, theta):
    '''Get predictions from the list d_t
    '''
    # this is binary only. The score is the Prob of 1.
    unused = set(range(Y.shape[0]))
    preds = -1 * ones(Y.shape[0])
    for i, j in enumerate(d_t):
        usedj = unused.intersection(X[j])  # these are the observations in X that make it to rule j
        preds[list(usedj)] = theta[i]
        unused = unused.difference(set(usedj))
    if preds.min() < 0:
        raise Exception  # this means some observation wasn't given a prediction - shouldn't happen
    return preds


##############MCMC core 
def bayesdl_mcmc(numiters, thinning, alpha, lbda, eta, X, Y, nruleslen, lhs_len, maxlhs, permsdic, burnin, rseed,
                 d_init):
    '''Run Metropolis-Hastings algorithm
    '''
    # initialize
    perms = []
    if rseed:
        random.seed(rseed)

    # Do some pre-computation for the prior
    beta_Z, logalpha_pmf, logbeta_pmf = prior_calculations(lbda, len(X), eta, maxlhs)
    if d_init:  # If we want to begin our chain at a specific place (e.g. to continue a chain)
        d_t = Pickle.loads(d_init)
        d_t.extend([i for i in range(len(X)) if i not in d_t])
        R_t = d_t.index(0)
        N_t = compute_rule_usage(d_t, R_t, X, Y)
    else:
        d_t, R_t, N_t = initialize_d(X, Y, lbda, eta, lhs_len, maxlhs,
                                     nruleslen)  # Otherwise sample the initial value from the prior

    # Add to dictionary which will store the sampling results
    a_t = Pickle.dumps(d_t[:R_t + 1])  # The antecedent list in string form
    if a_t not in permsdic:
        permsdic[a_t][0] = fn_logposterior(d_t, R_t, N_t, alpha, logalpha_pmf, logbeta_pmf, maxlhs, beta_Z, nruleslen,
                                           lhs_len)  # Compute its logposterior
    if burnin == 0:
        permsdic[a_t][1] += 1  # store the initialization sample

    # iterate!
    for itr in range(numiters):
        # Sample from proposal distribution
        d_star, Jratio, R_star, step = proposal(d_t, R_t, X, Y, alpha)
        # Compute the new posterior value, if necessary
        a_star = Pickle.dumps(d_star[:R_star + 1])
        if a_star not in permsdic:
            N_star = compute_rule_usage(d_star, R_star, X, Y)
            permsdic[a_star][0] = fn_logposterior(d_star, R_star, N_star, alpha, logalpha_pmf, logbeta_pmf, maxlhs,
                                                  beta_Z, nruleslen, lhs_len)
        # Compute the metropolis acceptance probability
        q = exp(permsdic[a_star][0] - permsdic[a_t][0] + Jratio)
        u = random.random()
        if u < q:
            # then we accept the move
            d_t = list(d_star)
            R_t = int(R_star)
            a_t = a_star
            # else: pass
        if itr > burnin and itr % thinning == 0:
            ##store
            permsdic[a_t][1] += 1
            perms.append(a_t)
    return permsdic, perms


def initialize_d(X, Y, lbda, eta, lhs_len, maxlhs, nruleslen):
    '''Samples a list from the prior
    '''
    m = inf  # numpy's lowercase inf (the Inf alias was removed in newer numpy versions)
    while m >= len(X):
        m = poisson.rvs(lbda)  # sample the length of the list from Poisson(lbda), truncated at len(X)
    # prepare the list
    d_t = []
    empty_rulelens = [r for r in range(1, maxlhs + 1) if r not in nruleslen]
    used_rules = []
    for i in range(m):
        # Sample a rule size.
        r = 0
        while r == 0 or r > maxlhs or r in empty_rulelens:
            r = poisson.rvs(
                eta)  # Sample the rule size from Poisson(eta), truncated at 0 and maxlhs and not using empty rule lens
        # Now sample a rule of that size uniformly at random
        rule_cands = [j for j, lhslen in enumerate(lhs_len) if lhslen == r and j not in used_rules]
        random.shuffle(rule_cands)
        j = rule_cands[0]
        # And add it in
        d_t.append(j)
        used_rules.append(j)
        assert lhs_len[j] == r
        if len(rule_cands) == 1:
            empty_rulelens.append(r)
    # Done adding rules. We have added m rules. Finish up.
    d_t.append(0)  # all done
    d_t.extend([i for i in range(len(X)) if i not in d_t])
    R_t = d_t.index(0)
    assert R_t == m
    # Figure out what rules are used to classify what points
    N_t = compute_rule_usage(d_t, R_t, X, Y)
    return d_t, R_t, N_t


def proposal(d_t, R_t, X, Y, alpha):
    '''Propose a new d_star
    '''
    d_star = list(d_t)
    R_star = int(R_t)
    # We begin with these as the move probabilities, but will renormalize as needed if certain moves are unavailable.
    move_probs_default = array([0.3333333333, 0.3333333333, 0.3333333333])
    # We have 3 moves: move, add, cut. Define the pdf for the probabilities of the moves, in that order:
    if R_t == 0:
        # List is empty. We must add.
        move_probs = array([0., 1., 0.])
        # This is an add transition. The probability of the reverse cut move is the prob of a list of len 1 having
        # a cut (other option for list of len 1 is an add).
        Jratios = array([0., move_probs_default[2] / float(move_probs_default[1] + move_probs_default[2]), 0.])
    elif R_t == 1:
        # List has one rule on it. We cannot move, must add or cut.
        move_probs = array(move_probs_default)  # copy
        move_probs[0] = 0.  # drop the 'move' move.
        move_probs = move_probs / sum(move_probs)  # renormalize
        # If add, probability of the reverse cut is the default cut probability
        # If cut, probability of the reverse add is 1.
        inv_move_probs = array([0., move_probs_default[2], 1.])
        Jratios = zeros_like(move_probs)
        Jratios[1:] = inv_move_probs[1:] / move_probs[1:]  # array elementwise division
    elif R_t == len(d_t) - 1:
        # List has all rules on it. We cannot add, must move or cut.
        move_probs = array(move_probs_default)  # copy
        move_probs[1] = 0.  # drop the 'add' move.
        move_probs = move_probs / sum(move_probs)  # renormalize
        # If move, probability of reverse move is move_probs[0], so Jratio = 1.
        # If cut, the probability of the reverse add is move_probs_default[1].
        Jratios = array([1., 0., move_probs_default[1] / move_probs[2]])
    elif R_t == len(d_t) - 2:
        # List has all rules but 1 on it.
        # Move probabilities are the default, but the inverse are a little different.
        move_probs = array(move_probs_default)
        # If move, probability of reverse move is still default, so Jratio = 1.
        # If cut, the probability of the reverse add is move_probs_default[1].
        # If add, the list becomes full, so the reverse cut comes from a move/cut-only state
        # and has probability move_probs_default[2] / (move_probs_default[0] + move_probs_default[2]).
        Jratios = array([1., move_probs_default[2] / float(move_probs_default[0] + move_probs_default[2]) / float(
            move_probs_default[1]), move_probs_default[1] / float(move_probs_default[2])])
    else:
        move_probs = array(move_probs_default)
        Jratios = array([1., move_probs[2] / float(move_probs[1]), move_probs[1] / float(move_probs[2])])
    u = random.random()
    # First we will find the indices for the insertion-deletion. indx1 is the item to be moved, indx2 is the new location
    if u < sum(move_probs[:1]):
        # This is an on-list move.
        step = 'move'
        [indx1, indx2] = random.permutation(range(len(d_t[:R_t])))[:2]  # value error if there are no on list entries
        # print 'move',indx1,indx2
        Jratio = Jratios[0]  # ratio of move/move probabilities is 1.
    elif u < sum(move_probs[:2]):
        # this is an add
        step = 'add'
        indx1 = R_t + 1 + random.randint(0, len(
            d_t[R_t + 1:]))  # this will throw ValueError if there are no off list entries
        indx2 = random.randint(0, len(d_t[:R_t + 1]))  # this one will always work
        # print 'add',indx1,indx2
        # the probability of going from d_star back to d_t is the probability of the corresponding cut.
        # p(d*->d|cut) = 1/|d*| = 1/(|d|+1) = 1./float(R_t+1)
        # p(d->d*|add) = 1/((|a|-|d|)(|d|+1)) = 1./(float(len(d_t)-1-R_t)*float(R_t+1))
        Jratio = Jratios[1] * float(len(d_t) - 1 - R_t)
        R_star += 1
    elif u < sum(move_probs[:3]):
        # this is a cut
        step = 'cut'
        indx1 = random.randint(0, len(d_t[:R_t]))  # this will throw ValueError if there are no on list entries
        indx2 = R_t + random.randint(0, len(d_t[R_t:]))  # this one will always work
        # print 'cut',indx1,indx2
        # the probability of going from d_star back to d_t is the probability of the corresponding add.
        # p(d*->d|add) = 1/((|a|-|d*|)(|d*|+1)) = 1/((|a|-|d|+1)(|d|))
        # p(d->d*|cut) = 1/|d|
        Jratio = Jratios[2] * (1. / float(len(d_t) - 1 - R_t + 1))
        R_star -= 1
    else:
        raise Exception
    # Now do the insertion-deletion
    d_star.insert(indx2, d_star.pop(indx1))
    return d_star, log(Jratio), R_star, step


def prior_calculations(lbda, maxlen, eta, maxlhs):
    '''Compute the normalization constants for the prior on rule cardinality
    '''
    # First normalization constants for beta
    beta_Z = poisson.cdf(maxlhs, eta) - poisson.pmf(0, eta)
    # Then the actual un-normalized pmfs
    logalpha_pmf = {}
    for i in range(maxlen + 1):
        try:
            logalpha_pmf[i] = poisson.logpmf(i, lbda)
        except RuntimeWarning:
            logalpha_pmf[i] = -inf
    logbeta_pmf = {}
    for i in range(1, maxlhs + 1):
        logbeta_pmf[i] = poisson.logpmf(i, eta)
    return beta_Z, logalpha_pmf, logbeta_pmf


def fn_logposterior(d_t, R_t, N_t, alpha, logalpha_pmf, logbeta_pmf, maxlhs, beta_Z, nruleslen, lhs_len):
    '''Compute log posterior
    '''
    loglikelihood = fn_loglikelihood(d_t, N_t, R_t, alpha)
    logprior = fn_logprior(d_t, R_t, logalpha_pmf, logbeta_pmf, maxlhs, beta_Z, nruleslen, lhs_len)
    return loglikelihood + logprior


def fn_loglikelihood(d_t, N_t, R_t, alpha):
    '''Compute log likelihood
    '''
    gammaln_Nt_jk = gammaln(N_t + alpha)
    gammaln_Nt_j = gammaln(sum(N_t + alpha, 1))
    loglikelihood = sum(gammaln_Nt_jk) - sum(gammaln_Nt_j)
    return loglikelihood


def fn_logprior(d_t, R_t, logalpha_pmf, logbeta_pmf, maxlhs, beta_Z, nruleslen, lhs_len):
    '''Compute log prior
    The prior is computed only up to proportionality: we drop the normalization for alpha.
    beta_Z is the normalization for beta, except for the terms that must be dropped due to running out of rules.
    log p(d_star) = log alpha(m | lbda) + sum_{i=1..m} [ log beta(l_i | eta) + log gamma(r_i | l_i) ],
    where m, the length of the list, is R_t.
    "logalpha" below is the list-length prior (overloaded notation, unrelated to the hyperparameter alpha).
    '''
    logprior = 0.
    logalpha = logalpha_pmf[
        R_t]  # this is proportional to logalpha - we have dropped the normalization for truncating based on total number of rules
    logprior += logalpha
    empty_rulelens = []
    nlens = zeros(maxlhs + 1)
    for i in range(R_t):
        l_i = lhs_len[d_t[i]]
        logbeta = logbeta_pmf[l_i] - log(
            beta_Z - sum([exp(logbeta_pmf[l_j]) for l_j in empty_rulelens]))  # Correct the normalizer for exhausted rule lengths (subtract their probability mass, not their log-pmf)
        # Finally loggamma
        loggamma = -log(nruleslen[l_i] - nlens[l_i])
        # And now check if we have exhausted all rules of a certain size
        nlens[l_i] += 1
        if nlens[l_i] == nruleslen[l_i]:
            empty_rulelens.append(l_i)
        elif nlens[l_i] > nruleslen[l_i]:
            raise Exception
        # Add 'em in
        logprior += logbeta
        logprior += loggamma
    # All done
    return logprior


def compute_rule_usage(d_star, R_star, X, Y):
    '''Compute which rules are being used to classify data points with what labels
    '''
    N_star = zeros((R_star + 1, Y.shape[1]))
    remaining_unused = set(range(Y.shape[0]))
    i = 0
    while remaining_unused:
        j = d_star[i]
        usedj = remaining_unused.intersection(X[j])
        remaining_unused = remaining_unused.difference(set(usedj))
        N_star[i, :] = Y[list(usedj), :].sum(0)
        i += 1
    if int(sum(N_star)) != Y.shape[0]:
        raise Exception  # bug check
    return N_star

Functions

def bayesdl_mcmc(numiters, thinning, alpha, lbda, eta, X, Y, nruleslen, lhs_len, maxlhs, permsdic, burnin, rseed, d_init)

Run Metropolis-Hastings algorithm

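A minimal sketch of the input format this sampler expects, inferred from how the code indexes its arguments; the toy values below are illustrative and not from the paper. X[j] is the set of observation indices covered by antecedent j (antecedent 0 is the default rule and covers everything), Y is an n-by-2 label indicator matrix (column 1 means Y=1), lhs_len[j] is the number of conditions in antecedent j, and nruleslen maps each antecedent size to how many antecedents have that size.

from collections import defaultdict
from numpy import array

Y = array([[1, 0],            # observation 0: Y=0
           [0, 1],            # observation 1: Y=1
           [1, 0]])           # observation 2: Y=0
X = [{0, 1, 2},               # antecedent 0: default rule, covers all observations
     {0, 2},                  # antecedent 1: covers observations 0 and 2
     {1}]                     # antecedent 2: covers observation 1
lhs_len = [0, 1, 2]           # number of conditions in each antecedent
nruleslen = {1: 1, 2: 1}      # one antecedent of size 1, one of size 2
alpha = array([1., 1.])       # Dirichlet prior on the consequent
permsdic = defaultdict(default_permsdic)

permsdic, perms = bayesdl_mcmc(numiters=2000, thinning=1, alpha=alpha,
                               lbda=2., eta=1., X=X, Y=Y, nruleslen=nruleslen,
                               lhs_len=lhs_len, maxlhs=2, permsdic=permsdic,
                               burnin=200, rseed=42, d_init=None)
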
def compute_rule_usage(d_star, R_star, X, Y)

Compute which rules are being used to classify data points with what labels

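Continuing the toy setup sketched under bayesdl_mcmc above (illustrative only), the returned matrix has one row per rule on the list plus the default rule, and one column per class:

d_star = [1, 0, 2]                       # rule 1, then the default rule; antecedent 2 is off-list
R_star = d_star.index(0)                 # = 1, the position of the default rule
N_star = compute_rule_usage(d_star, R_star, X, Y)
# N_star[i, k] counts the observations captured by rule i that have label k:
# array([[2., 0.],    # rule 1 captures observations 0 and 2, both with Y=0
#        [0., 1.]])   # the default rule captures observation 1, which has Y=1
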
def default_permsdic()

For producing the defaultdict used for storing MCMC results

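Each value in permsdic is a two-element list [log posterior, number of samples], as described in the module header; new antecedent lists get added lazily via this factory. For example:

from collections import defaultdict
permsdic = defaultdict(default_permsdic)   # unseen keys start out as [0., 0.]
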
def fn_loglikelihood(d_t, N_t, R_t, alpha)

Compute log likelihood

def fn_logposterior(d_t, R_t, N_t, alpha, logalpha_pmf, logbeta_pmf, maxlhs, beta_Z, nruleslen, lhs_len)

Compute log posterior

def fn_logprior(d_t, R_t, logalpha_pmf, logbeta_pmf, maxlhs, beta_Z, nruleslen, lhs_len)

Compute log prior

The prior is computed only up to proportionality: the normalization for alpha is dropped. beta_Z is the normalization for beta, except for the terms that must be dropped due to running out of rules. log p(d_star) = log alpha(m | lbda) + sum_{i=1..m} [ log beta(l_i | eta) + log gamma(r_i | l_i) ], where m, the length of the list, is R_t. "logalpha" in this code is the list-length prior (overloaded notation, unrelated to the hyperparameter alpha).

def gelmanrubin(res)

Check convergence with the Gelman-Rubin diagnostic (Rhat values near 1 indicate the chains have mixed).

def get_point_estimate(permsdic, lhs_len, X, Y, alpha, nruleslen, maxlhs, lbda, eta, verbose=True)

Get a point estimate with length and width similar to the posterior average, with highest likelihood

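An illustrative call, assuming res holds the chains returned by run_bdl_multichain_serial and the remaining arguments match the values used for sampling (toy settings as in the bayesdl_mcmc sketch above):

permsdic_merged = merge_chains(res)
d_star = get_point_estimate(permsdic_merged, lhs_len, X, Y, alpha,
                            nruleslen, maxlhs=2, lbda=2., eta=1.)
# d_star is None if no sampled list matched the posterior-average length and width.
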
def get_rule_rhs(Xtrain, Ytrain, d_t, alpha, intervals)

Compute the posterior consequent distributions (i.e., for each rule in the list, the posterior P(Y=1) and its 95% credible interval, based on the points that rule captures)

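Given a point-estimate list d_star (ending with the default rule, index 0), a sketch of recovering each rule's posterior P(Y=1) and 95% credible interval, continuing the toy setup above:

theta, ci_theta = get_rule_rhs(X, Y, d_star, alpha, intervals=True)
for i, j in enumerate(d_star):
    print('antecedent', j, 'P(Y=1) =', round(theta[i], 3), '95% CI:', ci_theta[i])
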
def initialize_d(X, Y, lbda, eta, lhs_len, maxlhs, nruleslen)

Samples a list from the prior

def mcmcchain(numiters, thinning, alpha, lbda, eta, X, Y, nruleslen, lhs_len, maxlhs, permsdic, burnin, nchains, d_init)

Run and store mcmc chain

def merge_chains(res)

Merge chains into a single collection of posterior samples

def plot_chains(res)

Plot the logposterior values for the samples in the chains.

def preds_d_t(X, Y, d_t, theta)

Get predictions from the list d_t

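Continuing the toy setup: a sketch of scoring data with the point estimate and thresholding at 0.5, mirroring how accur_d_star is described in the module header.

preds = preds_d_t(X, Y, d_star, theta)   # P(Y=1) for every observation
yhat = (preds > 0.5).astype(int)         # decision boundary at 0.5
accuracy = (yhat == Y[:, 1]).mean()      # column 1 of Y is the Y=1 indicator
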
def prior_calculations(lbda, maxlen, eta, maxlhs)

Compute the normalization constants for the prior on rule cardinality

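For example (illustrative values): logalpha_pmf is keyed by list length 0..maxlen, logbeta_pmf by rule size 1..maxlhs, and beta_Z is the truncated-Poisson normalizer used in fn_logprior.

beta_Z, logalpha_pmf, logbeta_pmf = prior_calculations(lbda=2., maxlen=3, eta=1., maxlhs=2)
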
def proposal(d_t, R_t, X, Y, alpha)

Propose a new d_star

def reset_permsdic(permsdic)

Resets the number of MCMC samples stored (value[1]) while maintaining the log-posterior value (so it doesn't need to be re-computed in future chains).

def run_bdl_multichain_serial(numiters, thinning, alpha, lbda, eta, X, Y, nruleslen, lhs_len, maxlhs, permsdic, burnin, nchains, d_inits, verbose=True, seed=42)

Run mcmc for each of the chains in serial

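Putting the pieces together, an end-to-end sketch on the toy data from the bayesdl_mcmc example above; all hyperparameter values here are illustrative, not the settings used in the paper.

from collections import defaultdict
from numpy import array

alpha = array([1., 1.])
lbda, eta, maxlhs = 2., 1., 2
permsdic = defaultdict(default_permsdic)

# Three chains, each initialized by sampling from the prior (d_init of None)
res, Rhat = run_bdl_multichain_serial(5000, 1, alpha, lbda, eta, X, Y, nruleslen,
                                      lhs_len, maxlhs, permsdic, burnin=500,
                                      nchains=3, d_inits=[None, None, None])

permsdic = merge_chains(res)                                 # pool the posterior samples
d_star = get_point_estimate(permsdic, lhs_len, X, Y, alpha,  # BRL-point list
                            nruleslen, maxlhs, lbda, eta)
if d_star is not None:
    theta, ci_theta = get_rule_rhs(X, Y, d_star, alpha, intervals=True)
    preds = preds_d_t(X, Y, d_star, theta)                   # P(Y=1) per observation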