Module imodelsx.auglinear.embed
Functions
def embed_and_sum_function(example,
model,
ngrams: int,
tokenizer_embeddings,
tokenizer_ngrams,
checkpoint: str,
dataset_key_text: str = None,
layer: str = 'last_hidden_state',
padding: str = True,
batch_size: int = 8,
parsing: str = '',
nlp_chunks=None,
all_ngrams: bool = False,
fit_with_ngram_decomposition: bool = True,
embedding_prefix: str = 'Represent the short phrase for sentiment classification: ',
embedding_suffix: str = '',
embedding_strategy: str = 'mean',
sum_embeddings=True,
prune_stopwords=False)
def embed_and_sum_function(
    example,
    model,
    ngrams: int,
    tokenizer_embeddings,
    tokenizer_ngrams,
    checkpoint: str,
    dataset_key_text: str = None,
    layer: str = "last_hidden_state",
    padding: str = True,
    batch_size: int = 8,
    parsing: str = "",
    nlp_chunks=None,
    all_ngrams: bool = False,
    fit_with_ngram_decomposition: bool = True,
    embedding_prefix: str = "Represent the short phrase for sentiment classification: ",
    embedding_suffix: str = "",
    embedding_strategy: str = 'mean',
    sum_embeddings=True,
    prune_stopwords=False,
):
    """Get summed embeddings for a single example

    Params
    ------
    ngrams: int
        What order of ngrams to use (1 for unigrams, 2 for bigrams, ...)
    dataset_key_text: str
        str that identifies where data examples are stored, e.g. "sentence" for sst2
    tokenizer_embeddings
        tokenizing for the embedding model
    tokenizer_ngrams
        tokenizing the ngrams (word-based tokenization is more interpretable)
    layer: str
        which layer to extract embeddings from
    batch_size: int
        batch size for simultaneously running ngrams (for a single example)
    parsing: str
        whether to use parsing rather than extracting all ngrams
    nlp_chunks
        if parsing is not empty string, a parser that extracts specific ngrams
    fit_with_ngram_decomposition
        whether to fit the model with ngram decomposition (if not just use the standard sentence)
    embedding_prefix
        if checkpoint is an instructor/autoregressive model, prepend this prompt
    embedding_suffix
        if checkpoint is an autoregressive model, append this prompt
    embedding_strategy: str
        'mean': compute mean over ngram tokens
        'next_token_distr': use next token distribution as an embedding
            (requires AutoModelForCausalLM checkpoint)
    all_ngrams: bool
        whether to include all ngrams of lower order
    """
    # convert to list of strings
    seqs = _get_seqs(
        example, dataset_key_text, fit_with_ngram_decomposition,
        ngrams, tokenizer_ngrams, parsing, nlp_chunks, all_ngrams, prune_stopwords)

    if embedding_strategy == 'next_token_distr':
        seqs = [f'{embedding_prefix}{x_i}{embedding_suffix}' for x_i in seqs]

    if not checkpoint.startswith("hkunlp/instructor") and (
        not hasattr(tokenizer_embeddings, "pad_token")
        or tokenizer_embeddings.pad_token is None
    ):
        tokenizer_embeddings.pad_token = tokenizer_embeddings.eos_token

    # compute embeddings
    embs = []
    if checkpoint.startswith("hkunlp/instructor"):
        embs = model.encode(
            [[embedding_prefix, x_i] for x_i in seqs], batch_size=batch_size
        )
    else:
        tokens = tokenizer_embeddings(
            seqs, padding=padding, truncation=True, return_tensors="pt"
        )
        ds = Dataset.from_dict(tokens).with_format("torch")
        for batch in DataLoader(ds, batch_size=batch_size, shuffle=False):
            batch = {k: v.to(model.device) for k, v in batch.items()}
            with torch.no_grad():
                output = model(**batch)
            torch.cuda.empty_cache()
            if embedding_strategy == "next_token_distr":
                emb = _next_token_distr_with_mask(
                    output["logits"], batch["attention_mask"]
                )
            else:
                if layer == "pooler_output":
                    emb = output["pooler_output"]
                elif layer == "last_hidden_state_mean" or layer == "last_hidden_state":
                    # extract (batch_size, seq_len, hidden_size)
                    emb = output["last_hidden_state"]
                    # convert to (batch_size, hidden_size)
                    emb = _mean_with_mask(emb, batch["attention_mask"])
                elif "hidden_states" in output.keys():
                    # extract (layer x (batch_size, seq_len, hidden_size))
                    h = output["hidden_states"]
                    # convert to (batch_size, seq_len, hidden_size)
                    emb = h[0]
                    # convert to (batch_size, hidden_size)
                    emb = _mean_with_mask(emb, batch["attention_mask"])
                else:
                    raise Exception(f"keys: {output.keys()}")
            embs.append(emb.cpu().detach().numpy())
        embs = np.concatenate(embs)
    # else:
    #     raise Exception(f"Unknown model checkpoint {checkpoint}")

    # sum over the embeddings
    if sum_embeddings:
        embs = embs.sum(axis=0).reshape(1, -1)
        if len(seqs) == 0:
            embs *= 0

    return {"embs": embs, "seq_len": len(seqs)}
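The helper _mean_with_mask is not shown in this listing. As a point of reference only, a minimal sketch of masked mean pooling of this kind, assuming (batch_size, seq_len, hidden_size) hidden states and a 0/1 attention mask, could look like the following (this is an illustrative reimplementation, not the module's actual helper):

    import torch

    def masked_mean(hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        # hidden_states: (batch_size, seq_len, hidden_size)
        # attention_mask: (batch_size, seq_len), 1 for real tokens, 0 for padding
        mask = attention_mask.unsqueeze(-1).type_as(hidden_states)  # (batch_size, seq_len, 1)
        summed = (hidden_states * mask).sum(dim=1)                  # sum over non-padding tokens
        counts = mask.sum(dim=1).clamp(min=1)                       # avoid division by zero
        return summed / counts                                       # (batch_size, hidden_size)

Pooling with the attention mask matters here because the ngrams within one example have very different lengths, so averaging over padded positions would skew short ngrams toward the padding embedding.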
Get summed embeddings for a single example
Params
ngrams: int
    What order of ngrams to use (1 for unigrams, 2 for bigrams, …)
dataset_key_text: str
    identifies where data examples are stored, e.g. "sentence" for sst2
tokenizer_embeddings
    tokenizer for the embedding model
tokenizer_ngrams
    tokenizer for the ngrams (word-based tokenization is more interpretable)
layer: str
    which layer to extract embeddings from
batch_size: int
    batch size for simultaneously running ngrams (for a single example)
parsing: str
    if non-empty, use parsing rather than extracting all ngrams
nlp_chunks
    if parsing is not an empty string, a parser that extracts specific ngrams
fit_with_ngram_decomposition
    whether to fit the model with ngram decomposition (if not, just use the standard sentence)
embedding_prefix
    if checkpoint is an instructor/autoregressive model, prepend this prompt
embedding_suffix
    if checkpoint is an autoregressive model, append this prompt
embedding_strategy: str
    'mean': compute mean over ngram tokens
    'next_token_distr': use next token distribution as an embedding (requires AutoModelForCausalLM checkpoint)
all_ngrams: bool
    whether to include all ngrams of lower order
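For orientation, a minimal usage sketch is given below. The checkpoint name, the example dict, and the choice of spaCy's English tokenizer for tokenizer_ngrams are illustrative assumptions, not defaults defined by this module:

    from spacy.lang.en import English
    from transformers import AutoModel, AutoTokenizer

    from imodelsx.auglinear.embed import embed_and_sum_function

    # assumed BERT-style checkpoint for illustration
    checkpoint = "bert-base-uncased"
    tokenizer_embeddings = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)

    # word-level tokenizer used to build ngrams (assumed setup)
    tokenizer_ngrams = English().tokenizer

    # example dict keyed by dataset_key_text, as in sst2
    example = {"sentence": "a gripping and well-acted film"}

    out = embed_and_sum_function(
        example,
        model=model,
        ngrams=2,
        tokenizer_embeddings=tokenizer_embeddings,
        tokenizer_ngrams=tokenizer_ngrams,
        checkpoint=checkpoint,
        dataset_key_text="sentence",
    )
    # out["embs"] is the summed ngram embedding, shape (1, hidden_size);
    # out["seq_len"] is the number of ngrams that were embedded
    print(out["embs"].shape, out["seq_len"])

Because sum_embeddings=True by default, the per-ngram embeddings are collapsed into a single vector per example; pass sum_embeddings=False to keep one embedding per ngram.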