Module `imodelsx.sasc.m1_ngrams`

Functions

def explain_ngrams(text_str_list: List[str], mod: Callable[[List[str]], numpy.ndarray[float]], ngrams: int = 3, all_ngrams: bool = True, num_top_ngrams: int = 75, use_cache: bool = True, cache_filename: str = None, noise_ngram_scores: float = 0, noise_seed: int = None, text_str_list_restrict: List[str] = None) ‑> Tuple[List[str], List[str]]

Expand source code

def explain_ngrams(
    text_str_list: List[str],
    mod: Callable[[List[str]], np.ndarray[float]],
    ngrams: int = 3,
    all_ngrams: bool = True,
    num_top_ngrams: int = 75,
    use_cache: bool = True,
    cache_filename: str = None,
    noise_ngram_scores: float = 0,
    noise_seed: int = None,
    text_str_list_restrict: List[str] = None,
) -> Tuple[List[str], List[str]]:
    """
    Params
    ------
    text_str_list: List[str]
        The list of text strings to use to extract ngrams
    mod: Callable[[List[str]], List[float]]
        The module to interpret
    ngrams: int
        The order of ngrams to use (3 is trigrams)
    all_ngrams: bool
        If True, use all ngrams up to ngrams. If False, use only ngrams
    num_top_ngrams: int
        The number of top ngrams to return
    use_cache: bool
        If True, use the cache
    cache_filename: str
        The filename to use for the module ngram cache
    noise_ngram_scores: float
        If > 0, add noise to the ngram scores
    noise_seed: int
        The seed to use for the ngram scores noise
    text_str_list_restrict: List[str]
        If not None, restrict the top ngrams to those that appear in this corpus

    Returns
    -------
    ngram_list: List[str]
        The top ngrams
    ngram_scores: List[float]
        The scores for each ngram

    Note: this caches the call that gets the scores
    """
    # get all ngrams
    tok = English(max_length=10e10)
    text_str = " ".join(text_str_list)
    ngrams_list = imodelsx.util.generate_ngrams_list(
        text_str, ngrams=ngrams, tokenizer_ngrams=tok, all_ngrams=all_ngrams
    )

    # get unique ngrams
    ngrams_list = sorted(list(set(ngrams_list)))
    # pkl.dump(ngrams_list, open(
    # cache_filename.replace('.pkl', '_ngrams.pkl'), "wb"))
    # exit(0)
    print(f'{ngrams_list=}')

    # compute scores and cache...
    use_cache = (
        use_cache and cache_filename
    )  # can only use cache if cache_filename is not None
    if use_cache and os.path.exists(cache_filename):
        ngram_scores = pkl.load(open(cache_filename, "rb"))
    else:
        # some modules have specialized parameters...
        # fmri should cache all preds together, since they are efficiently computed together
        call_parameters = inspect.signature(mod.__call__).parameters.keys()
        print("predicting all ngrams...")
        if "return_all" in call_parameters:
            ngram_scores = mod(ngrams_list, return_all=True)
        elif "calc_ngram" in call_parameters:
            ngram_scores = mod(ngrams_list, calc_ngram=True)
        else:
            ngram_scores = mod(ngrams_list)

        if use_cache:
            os.makedirs(dirname(cache_filename), exist_ok=True)
            # pkl.dump(ngrams_list, open(
            # cache_filename.replace('.pkl', '_ngrams.pkl'), "wb"))
            pkl.dump(ngram_scores, open(cache_filename, "wb"))

    # multidimensional predictions
    # this is rare, module should just return a scalar
    # but for fMRI voxels, we cached this as a full matrix and need to now select a column
    if isinstance(ngram_scores, list):
        ngram_scores = np.array(ngram_scores)
    if len(ngram_scores.shape) > 1 and ngram_scores.shape[1] > 1:
        ngram_scores = ngram_scores[:, mod.voxel_num_best]

    # add noise to ngram scores
    if noise_ngram_scores > 0:
        scores_top_100 = np.sort(ngram_scores)[::-1][:100]
        std_top_100 = np.std(scores_top_100)
        rng = np.random.default_rng(noise_seed)
        ngram_scores += rng.normal(
            scale=std_top_100 * noise_ngram_scores,
            size=ngram_scores.shape,
        )

    # restrict top ngrams to alternative corpus
    if text_str_list_restrict is not None:
        print("before", ngrams_list)
        ngrams_set_restrict = set(
            imodelsx.util.generate_ngrams_list(
                " ".join(text_str_list_restrict),
                ngrams=ngrams,
                tokenizer_ngrams=tok,
                all_ngrams=all_ngrams,
            )
        )
        idxs_to_keep = np.array(
            [i for i, ngram in enumerate(
                ngrams_list) if ngram in ngrams_set_restrict]
        )
        ngrams_list = [ngrams_list[i] for i in idxs_to_keep]
        ngram_scores = ngram_scores[idxs_to_keep]
        print("after", ngrams_list)

    # print(f'{ngram_scores=}')
    scores_top_idxs = np.argsort(ngram_scores)[::-1][:num_top_ngrams]
    scores_top = ngram_scores[scores_top_idxs]
    ngrams_top = np.array(ngrams_list)[scores_top_idxs]
    return ngrams_top.flatten().tolist(), scores_top.flatten().tolist()

Params

text_str_list: List[str] The list of text strings to use to extract ngrams mod: Callable[[List[str]], List[float]] The module to interpret ngrams: int The order of ngrams to use (3 is trigrams) all_ngrams: bool If True, use all ngrams up to ngrams. If False, use only ngrams num_top_ngrams: int The number of top ngrams to return use_cache: bool If True, use the cache cache_filename: str The filename to use for the module ngram cache noise_ngram_scores: float If > 0, add noise to the ngram scores noise_seed: int The seed to use for the ngram scores noise text_str_list_restrict: List[str] If not None, restrict the top ngrams to those that appear in this corpus

Returns

ngram_list : List[str]: The top ngrams
ngram_scores : List[float]: The scores for each ngram
Note : this caches the call that gets the scores