Module imodelsx.sasc.m2_summarize

Functions

def clean_summary(summary: str)
Expand source code
def clean_summary(summary: str):
    summary = summary.strip().lower()

    # keep removing unnecessary prefixes
    modified_str = True
    while modified_str:
        modified_str = False
        for k in [
            "that",
            "they",
            "are",
            "all",
            "contain",
            "the",
            "use of",
            "related to",
            "involve",
            "some form of",
            "some kind of",
            "the use of",
            "describe",
            "refer to",
            "words",
            "word",
            "used to",
            "relate to",
        ]:
            if summary.startswith(k):
                summary = summary[len(k) :].strip()
                modified_str = True

    # remove unnecessary suffix
    if summary.endswith("."):
        summary = summary[:-1]

    # remove quotation marks
    summary = summary.replace('"', "")

    # sometimes summary comes with a rationale, e.g.
    # Summary: [involve people and places.]
    # Rationale: [Many of the phrases involve someone saying something, or someone being somewhere. There are also references to family members, such as "father's getting" and "grandfather caught a"]
    if ". " in summary:
        summary_clean = summary[: summary.index(". ")].strip()
        summary_rationale = summary[summary.index(". ") + 1 :].strip()
    else:
        summary_clean = summary
        summary_rationale = ""

    return summary_clean, summary_rationale
def summarize_ngrams(llm: Callable[[str], str],
ngrams_list: List[str],
num_summaries: int = 2,
prefix_str='Here is a list of phrases:',
suffix_str='What is a common theme among these phrases?\nThe common theme among these phrases is',
num_top_ngrams_to_use: int = 30,
num_top_ngrams_to_consider: int = 50,
seed: int = 0) ‑> Tuple[List[str], List[str]]
Expand source code
def summarize_ngrams(
    llm: Callable[[str], str],
    ngrams_list: List[str],
    num_summaries: int = 2,
    prefix_str="Here is a list of phrases:",
    suffix_str="What is a common theme among these phrases?\nThe common theme among these phrases is",
    num_top_ngrams_to_use: int = 30,
    num_top_ngrams_to_consider: int = 50,
    seed: int = 0,
) -> Tuple[List[str], List[str]]:
    """Refine a keyphrase by making a call to the llm

    Params
    ------
    llm: Callable[[str], str]
        The llm to use
    ngrams_list: List[str]
        The list of ngrams to summarize
    num_summaries: int
        The number of summaries to generate
    prefix_str: str
        The prefix of the prompt string to use for the llm summarization
    suffix_str: str
        The suffix of the prompt string to use for the llm summarization
    num_top_ngrams_to_use: int
        The number of top ngrams to select
    num_top_ngrams_to_consider: int
        The number of top ngrams to consider selecting from
    seed: int
        The seed to use for the random number generator

    Returns
    -------
    summaries: List[str]
        The list of summaries
    summary_rationales: List[str]
        The list of summary rationales (when available)
    """
    rng = np.random.default_rng(seed)

    summaries = []
    summary_rationales = []
    for i in range(num_summaries):
        # randomly sample num_top_ngrams (preserving ordering)
        n_to_consider = min(num_top_ngrams_to_consider, len(ngrams_list))
        n_to_use = min(num_top_ngrams_to_use, n_to_consider)
        idxs = np.sort(rng.choice(n_to_consider, size=n_to_use, replace=False))
        bullet_list_ngrams = "- " + "\n- ".join(np.array(ngrams_list)[idxs])
        prompt = prefix_str + "\n\n" + bullet_list_ngrams + "\n\n" + suffix_str
        if i == 0:
            print("First prompt")
            print(prompt)
        summary = llm(prompt)

        # clean up summary
        summary, summary_rationale = clean_summary(summary)
        summaries.append(summary)
        summary_rationales.append(summary_rationale)

    # remove replicates
    idxs_replicate = [False]
    summaries_running = {summaries[0]}
    for i in range(1, len(summaries)):
        if summaries[i] in summaries_running:
            idxs_replicate.append(True)
        else:
            idxs_replicate.append(False)
        summaries_running.add(summaries[i])
    summaries = [s for i, s in enumerate(summaries) if not idxs_replicate[i]]
    summary_rationales = [
        s for i, s in enumerate(summary_rationales) if not idxs_replicate[i]
    ]

    return summaries, summary_rationales

Refine a keyphrase by making a call to the llm

Params

llm: Callable[[str], str] The llm to use ngrams_list: List[str] The list of ngrams to summarize num_summaries: int The number of summaries to generate prefix_str: str The prefix of the prompt string to use for the llm summarization suffix_str: str The suffix of the prompt string to use for the llm summarization num_top_ngrams_to_use: int The number of top ngrams to select num_top_ngrams_to_consider: int The number of top ngrams to consider selecting from seed: int The seed to use for the random number generator

Returns

summaries : List[str]
The list of summaries
summary_rationales : List[str]
The list of summary rationales (when available)