Module imodelsx.sasc.m1_ngrams
Expand source code
from typing import Callable, List, Tuple
import imodelsx
import numpy as np
from spacy.lang.en import English
from os.path import dirname, join
import os.path
import pickle as pkl
import inspect
def explain_ngrams(
    text_str_list: List[str],
    mod: Callable[[List[str]], np.ndarray],
    ngrams: int = 3,
    all_ngrams: bool = True,
    num_top_ngrams: int = 75,
    use_cache: bool = True,
    cache_filename: str = None,
    noise_ngram_scores: float = 0,
    noise_seed: int = None,
    text_str_list_restrict: List[str] = None,
) -> Tuple[List[str], List[float]]:
    """Score every unique ngram of a corpus with ``mod`` and return the top ones.

    Params
    ------
    text_str_list: List[str]
        The list of text strings to use to extract ngrams
        (joined with single spaces before tokenization)
    mod: Callable[[List[str]], List[float]]
        The module to interpret. Called once on the sorted list of unique
        ngrams. If its ``__call__`` signature has a ``return_all`` or
        ``calc_ngram`` parameter, that flag is passed as ``True``. If it
        returns a matrix with more than one column, ``mod.voxel_num_best``
        is used to select the column.
    ngrams: int
        The order of ngrams to use (3 is trigrams)
    all_ngrams: bool
        Forwarded to ``imodelsx.util.generate_ngrams_list``.
        If True, use all ngrams up to ngrams. If False, use only ngrams
    num_top_ngrams: int
        The number of top ngrams to return
    use_cache: bool
        If True, use the cache (only effective if cache_filename is given)
    cache_filename: str
        The filename to use for the module ngram cache (a pickle file)
    noise_ngram_scores: float
        If > 0, add zero-mean gaussian noise to the ngram scores with
        standard deviation ``noise_ngram_scores * std(top-100 scores)``
    noise_seed: int
        The seed to use for the ngram scores noise
    text_str_list_restrict: List[str]
        If not None, restrict the top ngrams to those that appear in this corpus

    Returns
    -------
    ngram_list: List[str]
        The top ngrams, highest score first
    ngram_scores: List[float]
        The scores for each returned ngram

    Note: this caches the call that gets the scores (the raw module output,
    before column selection, noise and restriction are applied)
    """
    # get all ngrams
    tok = English(max_length=10e10)
    text_str = " ".join(text_str_list)
    ngrams_list = imodelsx.util.generate_ngrams_list(
        text_str, ngrams=ngrams, tokenizer_ngrams=tok, all_ngrams=all_ngrams
    )

    # get unique ngrams; sorting fixes the order so that scores (which are
    # matched to ngrams purely by position) line up with the ngram list
    ngrams_list = sorted(set(ngrams_list))

    # compute scores and cache...
    # can only use cache if cache_filename is not None / non-empty
    use_cache = bool(use_cache and cache_filename)
    if use_cache and os.path.exists(cache_filename):
        # NOTE(security): unpickling can execute arbitrary code, so the cache
        # file must come from a trusted location
        with open(cache_filename, "rb") as cache_file:
            ngram_scores = pkl.load(cache_file)
    else:
        # some modules have specialized parameters...
        # fmri should cache all preds together, since they are efficiently computed together
        call_parameters = inspect.signature(mod.__call__).parameters.keys()
        print("predicting all ngrams...")
        if "return_all" in call_parameters:
            ngram_scores = mod(ngrams_list, return_all=True)
        elif "calc_ngram" in call_parameters:
            ngram_scores = mod(ngrams_list, calc_ngram=True)
        else:
            ngram_scores = mod(ngrams_list)

        if use_cache:
            # dirname() is "" for a bare filename and os.makedirs("") raises
            cache_dir = dirname(cache_filename)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            with open(cache_filename, "wb") as cache_file:
                pkl.dump(ngram_scores, cache_file)

    # multidimensional predictions
    # this is rare, module should just return a scalar
    # but for fMRI voxels, we cached this as a full matrix and need to now select a column
    if isinstance(ngram_scores, list):
        ngram_scores = np.array(ngram_scores)
    if len(ngram_scores.shape) > 1 and ngram_scores.shape[1] > 1:
        ngram_scores = ngram_scores[:, mod.voxel_num_best]
    elif len(ngram_scores.shape) == 2 and ngram_scores.shape[1] == 1:
        # a column vector would be argsorted along its length-1 axis below,
        # yielding all-zero indexes; collapse it to one score per ngram
        ngram_scores = ngram_scores[:, 0]

    # add noise to ngram scores
    if noise_ngram_scores > 0:
        scores_top_100 = np.sort(ngram_scores)[::-1][:100]
        std_top_100 = np.std(scores_top_100)
        rng = np.random.default_rng(noise_seed)
        # out-of-place add: does not mutate the array handed back by the
        # module/cache and also works when the scores have an integer dtype
        ngram_scores = ngram_scores + rng.normal(
            scale=std_top_100 * noise_ngram_scores,
            size=ngram_scores.shape,
        )

    # restrict top ngrams to alternative corpus
    if text_str_list_restrict is not None:
        print("before", ngrams_list)
        ngrams_set_restrict = set(
            imodelsx.util.generate_ngrams_list(
                " ".join(text_str_list_restrict),
                ngrams=ngrams,
                tokenizer_ngrams=tok,
                all_ngrams=all_ngrams,
            )
        )
        # explicit int dtype: an empty list would otherwise become a float
        # array, which cannot be used as an index
        idxs_to_keep = np.array(
            [i for i, ngram in enumerate(ngrams_list) if ngram in ngrams_set_restrict],
            dtype=int,
        )
        ngrams_list = [ngrams_list[i] for i in idxs_to_keep]
        ngram_scores = ngram_scores[idxs_to_keep]
        print("after", ngrams_list)

    # pick the highest-scoring ngrams (descending order)
    scores_top_idxs = np.argsort(ngram_scores)[::-1][:num_top_ngrams]
    scores_top = ngram_scores[scores_top_idxs]
    ngrams_top = np.array(ngrams_list)[scores_top_idxs]
    return ngrams_top.flatten().tolist(), scores_top.flatten().tolist()
if __name__ == "__main__":
    # Small manual smoke test: score the ngrams of a toy corpus with a dummy
    # module and print the resulting (ngrams, scores) tuple.

    def mod(X):
        # dummy module: the score of each ngram is its position in the input list
        return np.arange(len(X)).astype(float)

    class a:
        # settings holder for the demo call below
        noise_ngram_scores = 3
        seed = 100
        # the two fields below have no matching explain_ngrams parameter
        module_num = 0
        module_num_restrict = -1

    # explain_ngrams takes the corpus first and the module second; the
    # settings object is not a positional argument, so its noise settings
    # are forwarded through the corresponding keyword parameters
    explanation = explain_ngrams(
        ["and", "i1", "i2", "i3", "i4"],
        mod,
        use_cache=False,
        noise_ngram_scores=a.noise_ngram_scores,
        noise_seed=a.seed,
    )
    print(explanation)
Functions
def explain_ngrams(text_str_list: List[str], mod: Callable[[List[str]], numpy.ndarray[float]], ngrams: int = 3, all_ngrams: bool = True, num_top_ngrams: int = 75, use_cache: bool = True, cache_filename: str = None, noise_ngram_scores: float = 0, noise_seed: int = None, text_str_list_restrict: List[str] = None) ‑> Tuple[List[str], List[str]]
-
Params
text_str_list: List[str] — The list of text strings to use to extract ngrams.
mod: Callable[[List[str]], List[float]] — The module to interpret.
ngrams: int — The order of ngrams to use (3 is trigrams).
all_ngrams: bool — If True, use all ngrams up to order `ngrams`. If False, use only ngrams of order `ngrams`.
num_top_ngrams: int — The number of top ngrams to return.
use_cache: bool — If True, use the cache.
cache_filename: str — The filename to use for the module ngram cache.
noise_ngram_scores: float — If > 0, add noise to the ngram scores.
noise_seed: int — The seed to use for the ngram scores noise.
text_str_list_restrict: List[str] — If not None, restrict the top ngrams to those that appear in this corpus.
Returns
ngram_list
:List[str]
- The top ngrams
ngram_scores
:List[float]
- The scores for each ngram
Note
: The call that computes the ngram scores is cached, but only when use_cache is True and cache_filename is provided.
Expand source code
def explain_ngrams(
    text_str_list: List[str],
    mod: Callable[[List[str]], np.ndarray],
    ngrams: int = 3,
    all_ngrams: bool = True,
    num_top_ngrams: int = 75,
    use_cache: bool = True,
    cache_filename: str = None,
    noise_ngram_scores: float = 0,
    noise_seed: int = None,
    text_str_list_restrict: List[str] = None,
) -> Tuple[List[str], List[float]]:
    """Score every unique ngram of a corpus with ``mod`` and return the top ones.

    Params
    ------
    text_str_list: List[str]
        The list of text strings to use to extract ngrams
        (joined with single spaces before tokenization)
    mod: Callable[[List[str]], List[float]]
        The module to interpret. Called once on the sorted list of unique
        ngrams. If its ``__call__`` signature has a ``return_all`` or
        ``calc_ngram`` parameter, that flag is passed as ``True``. If it
        returns a matrix with more than one column, ``mod.voxel_num_best``
        is used to select the column.
    ngrams: int
        The order of ngrams to use (3 is trigrams)
    all_ngrams: bool
        Forwarded to ``imodelsx.util.generate_ngrams_list``.
        If True, use all ngrams up to ngrams. If False, use only ngrams
    num_top_ngrams: int
        The number of top ngrams to return
    use_cache: bool
        If True, use the cache (only effective if cache_filename is given)
    cache_filename: str
        The filename to use for the module ngram cache (a pickle file)
    noise_ngram_scores: float
        If > 0, add zero-mean gaussian noise to the ngram scores with
        standard deviation ``noise_ngram_scores * std(top-100 scores)``
    noise_seed: int
        The seed to use for the ngram scores noise
    text_str_list_restrict: List[str]
        If not None, restrict the top ngrams to those that appear in this corpus

    Returns
    -------
    ngram_list: List[str]
        The top ngrams, highest score first
    ngram_scores: List[float]
        The scores for each returned ngram

    Note: this caches the call that gets the scores (the raw module output,
    before column selection, noise and restriction are applied)
    """
    # get all ngrams
    tok = English(max_length=10e10)
    text_str = " ".join(text_str_list)
    ngrams_list = imodelsx.util.generate_ngrams_list(
        text_str, ngrams=ngrams, tokenizer_ngrams=tok, all_ngrams=all_ngrams
    )

    # get unique ngrams; sorting fixes the order so that scores (which are
    # matched to ngrams purely by position) line up with the ngram list
    ngrams_list = sorted(set(ngrams_list))

    # compute scores and cache...
    # can only use cache if cache_filename is not None / non-empty
    use_cache = bool(use_cache and cache_filename)
    if use_cache and os.path.exists(cache_filename):
        # NOTE(security): unpickling can execute arbitrary code, so the cache
        # file must come from a trusted location
        with open(cache_filename, "rb") as cache_file:
            ngram_scores = pkl.load(cache_file)
    else:
        # some modules have specialized parameters...
        # fmri should cache all preds together, since they are efficiently computed together
        call_parameters = inspect.signature(mod.__call__).parameters.keys()
        print("predicting all ngrams...")
        if "return_all" in call_parameters:
            ngram_scores = mod(ngrams_list, return_all=True)
        elif "calc_ngram" in call_parameters:
            ngram_scores = mod(ngrams_list, calc_ngram=True)
        else:
            ngram_scores = mod(ngrams_list)

        if use_cache:
            # dirname() is "" for a bare filename and os.makedirs("") raises
            cache_dir = dirname(cache_filename)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            with open(cache_filename, "wb") as cache_file:
                pkl.dump(ngram_scores, cache_file)

    # multidimensional predictions
    # this is rare, module should just return a scalar
    # but for fMRI voxels, we cached this as a full matrix and need to now select a column
    if isinstance(ngram_scores, list):
        ngram_scores = np.array(ngram_scores)
    if len(ngram_scores.shape) > 1 and ngram_scores.shape[1] > 1:
        ngram_scores = ngram_scores[:, mod.voxel_num_best]
    elif len(ngram_scores.shape) == 2 and ngram_scores.shape[1] == 1:
        # a column vector would be argsorted along its length-1 axis below,
        # yielding all-zero indexes; collapse it to one score per ngram
        ngram_scores = ngram_scores[:, 0]

    # add noise to ngram scores
    if noise_ngram_scores > 0:
        scores_top_100 = np.sort(ngram_scores)[::-1][:100]
        std_top_100 = np.std(scores_top_100)
        rng = np.random.default_rng(noise_seed)
        # out-of-place add: does not mutate the array handed back by the
        # module/cache and also works when the scores have an integer dtype
        ngram_scores = ngram_scores + rng.normal(
            scale=std_top_100 * noise_ngram_scores,
            size=ngram_scores.shape,
        )

    # restrict top ngrams to alternative corpus
    if text_str_list_restrict is not None:
        print("before", ngrams_list)
        ngrams_set_restrict = set(
            imodelsx.util.generate_ngrams_list(
                " ".join(text_str_list_restrict),
                ngrams=ngrams,
                tokenizer_ngrams=tok,
                all_ngrams=all_ngrams,
            )
        )
        # explicit int dtype: an empty list would otherwise become a float
        # array, which cannot be used as an index
        idxs_to_keep = np.array(
            [i for i, ngram in enumerate(ngrams_list) if ngram in ngrams_set_restrict],
            dtype=int,
        )
        ngrams_list = [ngrams_list[i] for i in idxs_to_keep]
        ngram_scores = ngram_scores[idxs_to_keep]
        print("after", ngrams_list)

    # pick the highest-scoring ngrams (descending order)
    scores_top_idxs = np.argsort(ngram_scores)[::-1][:num_top_ngrams]
    scores_top = ngram_scores[scores_top_idxs]
    ngrams_top = np.array(ngrams_list)[scores_top_idxs]
    return ngrams_top.flatten().tolist(), scores_top.flatten().tolist()