Module imodelsx.sasc.m3_generate
Functions
def generate_synthetic_strs(llm: Callable[[str], str],
explanation_str: str,
num_synthetic_strs: int = 20,
template_num: int = 0,
verbose=True) -> Tuple[List[str], List[str]]
Expand source code
def generate_synthetic_strs(
    llm: Callable[[str], str],
    explanation_str: str,
    num_synthetic_strs: int = 20,
    template_num: int = 0,
    verbose: bool = True,
) -> Tuple[List[str], List[str]]:
    """Generate text_added and text_removed via call to an LLM.

    Params
    ------
    llm: Callable[[str], str]
        The llm to use
        flan-t5-xxl/opt-iml-max-30b can only generate one sentence before stopping
        EleutherAI/gpt-neox-20b can generate multiple sentences, but they are not
        faithful to the concept
    explanation_str: str
        The explanation string to use
    num_synthetic_strs: int
        The number of synthetic strings to generate
    template_num: int
        The prompt template number to use
    verbose: bool
        Whether to print the prompts and raw LLM generations for debugging

    Returns
    -------
    strs_added: List[str]
        The list of synthetic strings with the explanation concept added
    strs_removed: List[str]
        The list of synthetic strings with the explanation concept removed
    """
    templates = [
        """
Generate {num_synthetic_strs} sentences that {blank_or_do_not}contain the concept of "{concept}":

1. The""",
        """
Generate {num_synthetic_strs} phrases that are {blank_or_do_not}similar to the concept of "{concept}":

1.""",
    ]
    # For each template: the "" variant asks for strings containing the concept,
    # the non-empty variant asks for strings without it.
    blank_or_do_not_templates = [
        ["", "do not "],
        ["", "not "],
    ]
    template = templates[template_num]

    strs_added: List[str] = []
    strs_removed: List[str] = []
    for blank_or_do_not in blank_or_do_not_templates[template_num]:
        prompt = template.format(
            num_synthetic_strs=num_synthetic_strs,
            blank_or_do_not=blank_or_do_not,
            concept=explanation_str,
        )

        # note: this works with openai models
        # but tends to stop after generating just one text with non-openai
        synthetic_text_numbered_str = llm(prompt, max_new_tokens=400, do_sample=True)

        if verbose:
            print("\n\n---------------\n")
            print(prompt)
            print("\n\n---------------\n")
            print(synthetic_text_numbered_str)
            print("\n\n---------------\n")

        # Split on numbered-list markers like "1." or "12.".
        # The original pattern r"\d." left the dot unescaped, so it also split
        # on any digit followed by ANY character (e.g. inside "42 sheep" or
        # "3rd"); r"\d+\." matches only a true marker and handles multi-digit
        # markers in a single match.
        synthetic_strs_split = re.split(r"\d+\.", synthetic_text_numbered_str)
        synthetic_strs_split = [s.strip() for s in synthetic_strs_split if s.strip()]

        synthetic_strs = []
        for s in synthetic_strs_split:
            # defensive cleanup: drop a stray leading "." left by odd splits
            if s.startswith("."):
                s = s[1:]
            synthetic_strs.append(s.strip())
        # drop fragments too short to be meaningful text
        synthetic_strs = [s for s in synthetic_strs if len(s) > 2]

        if verbose:
            print("synthetic_strs=", synthetic_strs)

        if blank_or_do_not == "":
            strs_added.extend(synthetic_strs)
        else:
            strs_removed.extend(synthetic_strs)

    return strs_added, strs_removed
Generate text_added and text_removed via call to an LLM.
Params
llm: Callable[[str], str] The llm to use flan-t5-xxl/opt-iml-max-30b can only generate one sentence before stopping EleutherAI/gpt-neox-20b can generate multiple sentences, but they are not faithful to the concept explanation_str: str The explanation string to use num_synthetic_strs: int The number of synthetic strings to generate template_num: int The prompt template number to use
Returns
strs_added
: List[str]
- The list of synthetic strings with the explanation concept added
strs_removed
: List[str]
- The list of synthetic strings with the explanation concept removed