Module imodelsx.sasc.m2_summarize
Functions
def clean_summary(summary: str)
-
Expand source code
def clean_summary(summary: str):
    """Normalize a raw llm summary and split off any trailing rationale

    Params
    ------
    summary: str
        The raw text returned by the llm

    Returns
    -------
    summary_clean: str
        The lowercased summary with filler prefixes, a trailing period, and
        double quotes removed, truncated at the first ". "
    summary_rationale: str
        Everything after the first ". " (empty string when there is none)
    """
    filler_prefixes = (
        "that",
        "they",
        "are",
        "all",
        "contain",
        "the",
        "use of",
        "related to",
        "involve",
        "some form of",
        "some kind of",
        "the use of",
        "describe",
        "refer to",
        "words",
        "word",
        "used to",
        "relate to",
    )
    summary = summary.strip().lower()

    # keep removing unnecessary prefixes until a full pass changes nothing
    modified_str = True
    while modified_str:
        modified_str = False
        for prefix in filler_prefixes:
            if not summary.startswith(prefix):
                continue
            remainder = summary[len(prefix):]
            # only strip whole words: "the" must not eat the start of "theme",
            # nor "contain" the start of "contains"
            if remainder[:1].isalnum():
                continue
            summary = remainder.strip()
            modified_str = True

    # remove unnecessary suffix
    if summary.endswith("."):
        summary = summary[:-1]

    # remove quotation marks
    summary = summary.replace('"', "")

    # sometimes summary comes with a rationale, e.g.
    # Summary: [involve people and places.]
    # Rationale: [Many of the phrases involve someone saying something, or
    # someone being somewhere. There are also references to family members,
    # such as "father's getting" and "grandfather caught a"]
    head, separator, tail = summary.partition(". ")
    if not separator:
        return summary, ""
    return head.strip(), tail.strip()
def summarize_ngrams(llm: Callable[[str], str],
ngrams_list: List[str],
num_summaries: int = 2,
prefix_str='Here is a list of phrases:',
suffix_str='What is a common theme among these phrases?\nThe common theme among these phrases is',
num_top_ngrams_to_use: int = 30,
num_top_ngrams_to_consider: int = 50,
seed: int = 0) ‑> Tuple[List[str], List[str]]-
Expand source code
def summarize_ngrams(
    llm: Callable[[str], str],
    ngrams_list: List[str],
    num_summaries: int = 2,
    prefix_str="Here is a list of phrases:",
    suffix_str="What is a common theme among these phrases?\nThe common theme among these phrases is",
    num_top_ngrams_to_use: int = 30,
    num_top_ngrams_to_consider: int = 50,
    seed: int = 0,
) -> Tuple[List[str], List[str]]:
    """Summarize a list of ngrams by making repeated calls to the llm

    Each call prompts the llm with a random subset of the leading entries of
    ngrams_list; the cleaned responses are returned with duplicates removed.

    Params
    ------
    llm: Callable[[str], str]
        The llm to use
    ngrams_list: List[str]
        The list of ngrams to summarize
    num_summaries: int
        The number of summaries to generate
    prefix_str: str
        The prefix of the prompt string to use for the llm summarization
    suffix_str: str
        The suffix of the prompt string to use for the llm summarization
    num_top_ngrams_to_use: int
        The number of top ngrams to select
    num_top_ngrams_to_consider: int
        The number of top ngrams to consider selecting from
    seed: int
        The seed to use for the random number generator

    Returns
    -------
    summaries: List[str]
        The list of unique summaries, in the order they were first produced
        (empty when num_summaries is 0)
    summary_rationales: List[str]
        The list of summary rationales (when available), aligned with summaries
    """
    rng = np.random.default_rng(seed)

    # these do not change between prompts, so compute them once
    n_to_consider = min(num_top_ngrams_to_consider, len(ngrams_list))
    n_to_use = min(num_top_ngrams_to_use, n_to_consider)
    ngrams_arr = np.array(ngrams_list)

    summaries = []
    summary_rationales = []
    seen_summaries = set()
    for i in range(num_summaries):
        # randomly sample num_top_ngrams (preserving ordering)
        idxs = np.sort(rng.choice(n_to_consider, size=n_to_use, replace=False))
        bullet_list_ngrams = "- " + "\n- ".join(ngrams_arr[idxs])
        prompt = prefix_str + "\n\n" + bullet_list_ngrams + "\n\n" + suffix_str
        if i == 0:
            print("First prompt")
            print(prompt)

        # clean up summary
        summary, summary_rationale = clean_summary(llm(prompt))

        # remove replicates: only the first occurrence of a summary (and its
        # rationale) is kept
        if summary in seen_summaries:
            continue
        seen_summaries.add(summary)
        summaries.append(summary)
        summary_rationales.append(summary_rationale)

    return summaries, summary_rationales
Summarize a list of ngrams by making repeated calls to the llm
Params
llm: Callable[[str], str] The llm to use ngrams_list: List[str] The list of ngrams to summarize num_summaries: int The number of summaries to generate prefix_str: str The prefix of the prompt string to use for the llm summarization suffix_str: str The suffix of the prompt string to use for the llm summarization num_top_ngrams_to_use: int The number of top ngrams to select num_top_ngrams_to_consider: int The number of top ngrams to consider selecting from seed: int The seed to use for the random number generator
Returns
summaries
:List[str]
- The list of summaries
summary_rationales
:List[str]
- The list of summary rationales (when available)