Module `imodelsx.sasc.m2_summarize`

Expand source code

from typing import Any, List, Mapping, Optional, Tuple, Callable
import numpy as np
from imodelsx.llm import get_llm


def summarize_ngrams(
    llm: Callable[[str], str],
    ngrams_list: List[str],
    num_summaries: int = 2,
    prefix_str="Here is a list of phrases:",
    suffix_str="What is a common theme among these phrases?\nThe common theme among these phrases is",
    num_top_ngrams_to_use: int = 30,
    num_top_ngrams_to_consider: int = 50,
    seed: int = 0,
) -> Tuple[List[str], List[str]]:
    """Summarize a list of ngrams by making calls to the llm

    For each requested summary, a random subset of the first
    `num_top_ngrams_to_consider` entries of `ngrams_list` is rendered as a
    bulleted list (original ordering preserved), wrapped between `prefix_str`
    and `suffix_str`, and passed to `llm`. Each response is post-processed
    with `clean_summary`, and repeated summaries are dropped (the first
    occurrence is kept). The first prompt is printed to stdout.

    Params
    ------
    llm: Callable[[str], str]
        The llm to use
    ngrams_list: List[str]
        The list of ngrams to summarize
    num_summaries: int
        The number of summaries to generate (0 yields two empty lists)
    prefix_str: str
        The prefix of the prompt string to use for the llm summarization
    suffix_str: str
        The suffix of the prompt string to use for the llm summarization
    num_top_ngrams_to_use: int
        The number of top ngrams to select
    num_top_ngrams_to_consider: int
        The number of top ngrams to consider selecting from
    seed: int
        The seed to use for the random number generator

    Returns
    -------
    summaries: List[str]
        The list of unique summaries, in the order they were generated
    summary_rationales: List[str]
        The list of summary rationales (when available), aligned with summaries
    """
    rng = np.random.default_rng(seed)

    # these do not change between iterations, so compute them once
    n_to_consider = min(num_top_ngrams_to_consider, len(ngrams_list))
    n_to_use = min(num_top_ngrams_to_use, n_to_consider)
    ngrams_arr = np.array(ngrams_list)

    summaries = []
    summary_rationales = []
    for i in range(num_summaries):
        # randomly sample num_top_ngrams (preserving ordering)
        idxs = np.sort(rng.choice(n_to_consider, size=n_to_use, replace=False))
        bullet_list_ngrams = "- " + "\n- ".join(ngrams_arr[idxs])
        prompt = prefix_str + "\n\n" + bullet_list_ngrams + "\n\n" + suffix_str
        if i == 0:
            print("First prompt")
            print(prompt)
        summary = llm(prompt)

        # clean up summary
        summary, summary_rationale = clean_summary(summary)
        summaries.append(summary)
        summary_rationales.append(summary_rationale)

    # remove replicates, keeping the first occurrence of each summary
    # (iterating the pairs also makes the no-summaries case safe)
    seen = set()
    unique_summaries = []
    unique_rationales = []
    for summary, summary_rationale in zip(summaries, summary_rationales):
        if summary in seen:
            continue
        seen.add(summary)
        unique_summaries.append(summary)
        unique_rationales.append(summary_rationale)

    return unique_summaries, unique_rationales


def clean_summary(summary: str) -> Tuple[str, str]:
    """Normalize a raw llm completion into a summary and an optional rationale

    Params
    ------
    summary: str
        The raw text returned by the llm

    Returns
    -------
    summary_clean: str
        The lowercased summary with leading filler phrases, one trailing
        period, and double quotes removed, cut at the first ". "
    summary_rationale: str
        The text following the first ". " (empty string if there is none)
    """
    summary = summary.strip().lower()

    # filler phrases that are dropped when they lead the summary
    prefixes = (
        "that",
        "they",
        "are",
        "all",
        "contain",
        "the",
        "use of",
        "related to",
        "involve",
        "some form of",
        "some kind of",
        "the use of",
        "describe",
        "refer to",
        "words",
        "word",
        "used to",
        "relate to",
    )

    # keep removing unnecessary prefixes
    modified_str = True
    while modified_str:
        modified_str = False
        for k in prefixes:
            # only strip whole words/phrases: the prefix must be followed by
            # whitespace or end the string, so "the" is removed from
            # "the animals" while "theme parks" is left intact
            if summary.startswith(k) and (
                len(summary) == len(k) or summary[len(k)].isspace()
            ):
                summary = summary[len(k) :].strip()
                modified_str = True

    # remove unnecessary suffix
    if summary.endswith("."):
        summary = summary[:-1]

    # remove quotation marks
    summary = summary.replace('"', "")

    # sometimes summary comes with a rationale, e.g.
    # Summary: [involve people and places.]
    # Rationale: [Many of the phrases involve someone saying something, or someone being somewhere. There are also references to family members, such as "father's getting" and "grandfather caught a"]
    head, sep, tail = summary.partition(". ")
    if sep:
        # everything after the first sentence break is treated as the rationale
        return head.strip(), tail.strip()

    return summary, ""


if __name__ == "__main__":
    # Manual demo, part 1: run the cleaner on a canned completion that holds
    # a summary sentence followed by rationale sentences.
    example_completion = "relate to some form of science or research. Many of the phrases refer to specific scientific fields such as actuarial science, transracial adoption, and cognitive ability. Other phrases refer to the effects of certain phenomena, such as prozac and epilepsy"
    summary_clean, summary_rationale = clean_summary(example_completion)
    # print('Clean:', repr(summary_clean))
    # print('Rationale:', repr(summary_rationale))

    # Manual demo, part 2: summarize a toy list of ngrams with an llm
    # obtained from get_llm and show what comes back.
    toy_ngrams = ["cat", "dog", "bird", "elephant", "cheetah"]
    llm = get_llm(checkpoint="text-davinci-003")
    summaries, summary_rationales = summarize_ngrams(llm, toy_ngrams)
    print("Summaries:", summaries)
    print("Rationales:", summary_rationales)

Functions

def clean_summary(summary: str)

Expand source code

def clean_summary(summary: str) -> Tuple[str, str]:
    """Normalize a raw llm completion into a summary and an optional rationale

    Params
    ------
    summary: str
        The raw text returned by the llm

    Returns
    -------
    summary_clean: str
        The lowercased summary with leading filler phrases, one trailing
        period, and double quotes removed, cut at the first ". "
    summary_rationale: str
        The text following the first ". " (empty string if there is none)
    """
    summary = summary.strip().lower()

    # filler phrases that are dropped when they lead the summary
    prefixes = (
        "that",
        "they",
        "are",
        "all",
        "contain",
        "the",
        "use of",
        "related to",
        "involve",
        "some form of",
        "some kind of",
        "the use of",
        "describe",
        "refer to",
        "words",
        "word",
        "used to",
        "relate to",
    )

    # keep removing unnecessary prefixes
    modified_str = True
    while modified_str:
        modified_str = False
        for k in prefixes:
            # only strip whole words/phrases: the prefix must be followed by
            # whitespace or end the string, so "the" is removed from
            # "the animals" while "theme parks" is left intact
            if summary.startswith(k) and (
                len(summary) == len(k) or summary[len(k)].isspace()
            ):
                summary = summary[len(k) :].strip()
                modified_str = True

    # remove unnecessary suffix
    if summary.endswith("."):
        summary = summary[:-1]

    # remove quotation marks
    summary = summary.replace('"', "")

    # sometimes summary comes with a rationale, e.g.
    # Summary: [involve people and places.]
    # Rationale: [Many of the phrases involve someone saying something, or someone being somewhere. There are also references to family members, such as "father's getting" and "grandfather caught a"]
    head, sep, tail = summary.partition(". ")
    if sep:
        # everything after the first sentence break is treated as the rationale
        return head.strip(), tail.strip()

    return summary, ""

def summarize_ngrams(llm: Callable[[str], str], ngrams_list: List[str], num_summaries: int = 2, prefix_str='Here is a list of phrases:', suffix_str='What is a common theme among these phrases?\nThe common theme among these phrases is', num_top_ngrams_to_use: int = 30, num_top_ngrams_to_consider: int = 50, seed: int = 0) ‑> Tuple[List[str], List[str]]

Summarize a list of ngrams by making calls to the llm

Params

llm : Callable[[str], str]: The llm to use
ngrams_list : List[str]: The list of ngrams to summarize
num_summaries : int: The number of summaries to generate
prefix_str : str: The prefix of the prompt string to use for the llm summarization
suffix_str : str: The suffix of the prompt string to use for the llm summarization
num_top_ngrams_to_use : int: The number of top ngrams to select
num_top_ngrams_to_consider : int: The number of top ngrams to consider selecting from
seed : int: The seed to use for the random number generator

Returns

summaries : List[str]: The list of summaries
summary_rationales : List[str]: The list of summary rationales (when available)

Expand source code

def summarize_ngrams(
    llm: Callable[[str], str],
    ngrams_list: List[str],
    num_summaries: int = 2,
    prefix_str="Here is a list of phrases:",
    suffix_str="What is a common theme among these phrases?\nThe common theme among these phrases is",
    num_top_ngrams_to_use: int = 30,
    num_top_ngrams_to_consider: int = 50,
    seed: int = 0,
) -> Tuple[List[str], List[str]]:
    """Summarize a list of ngrams by making calls to the llm

    For each requested summary, a random subset of the first
    `num_top_ngrams_to_consider` entries of `ngrams_list` is rendered as a
    bulleted list (original ordering preserved), wrapped between `prefix_str`
    and `suffix_str`, and passed to `llm`. Each response is post-processed
    with `clean_summary`, and repeated summaries are dropped (the first
    occurrence is kept). The first prompt is printed to stdout.

    Params
    ------
    llm: Callable[[str], str]
        The llm to use
    ngrams_list: List[str]
        The list of ngrams to summarize
    num_summaries: int
        The number of summaries to generate (0 yields two empty lists)
    prefix_str: str
        The prefix of the prompt string to use for the llm summarization
    suffix_str: str
        The suffix of the prompt string to use for the llm summarization
    num_top_ngrams_to_use: int
        The number of top ngrams to select
    num_top_ngrams_to_consider: int
        The number of top ngrams to consider selecting from
    seed: int
        The seed to use for the random number generator

    Returns
    -------
    summaries: List[str]
        The list of unique summaries, in the order they were generated
    summary_rationales: List[str]
        The list of summary rationales (when available), aligned with summaries
    """
    rng = np.random.default_rng(seed)

    # these do not change between iterations, so compute them once
    n_to_consider = min(num_top_ngrams_to_consider, len(ngrams_list))
    n_to_use = min(num_top_ngrams_to_use, n_to_consider)
    ngrams_arr = np.array(ngrams_list)

    summaries = []
    summary_rationales = []
    for i in range(num_summaries):
        # randomly sample num_top_ngrams (preserving ordering)
        idxs = np.sort(rng.choice(n_to_consider, size=n_to_use, replace=False))
        bullet_list_ngrams = "- " + "\n- ".join(ngrams_arr[idxs])
        prompt = prefix_str + "\n\n" + bullet_list_ngrams + "\n\n" + suffix_str
        if i == 0:
            print("First prompt")
            print(prompt)
        summary = llm(prompt)

        # clean up summary
        summary, summary_rationale = clean_summary(summary)
        summaries.append(summary)
        summary_rationales.append(summary_rationale)

    # remove replicates, keeping the first occurrence of each summary
    # (iterating the pairs also makes the no-summaries case safe)
    seen = set()
    unique_summaries = []
    unique_rationales = []
    for summary, summary_rationale in zip(summaries, summary_rationales):
        if summary in seen:
            continue
        seen.add(summary)
        unique_summaries.append(summary)
        unique_rationales.append(summary_rationale)

    return unique_summaries, unique_rationales