Module imodelsx.util
import logging
from typing import List
from tqdm import tqdm
from transformers import pipeline
import datasets
import numpy as np
from collections import Counter
def generate_ngrams_list(
sentence: str,
ngrams: int,
tokenizer_ngrams=None,
all_ngrams=False,
parsing: str = '',
nlp_chunks=None,
pad_starting_ngrams=False,
pad_ending_ngrams=False,
min_frequency=1,
prune_stopwords=False,
):
"""Get list of ngrams from sentence using a tokenizer
Params
------
ngrams: int
What order of ngrams to use (1 for unigrams, 2 for bigrams, ...)
all_ngrams: bool
whether to include all n-grams up to n or just n
pad_starting_ngrams: bool
if all_ngrams=False, then pad starting ngrams with shorter length ngrams
so that length of ngrams_list is the same as the initial sequence
e.g. for ngrams=3 ["the", "the quick", "the quick brown", "quick brown fox", "brown fox jumps", ...]
pad_ending_ngrams: bool
if all_ngrams=False, then pad ending ngrams with shorter length ngrams (mirrors pad_starting_ngrams at the end of the sequence)
min_frequency: int
minimum frequency to be considered for the ngrams_list
"""
seqs = []
if tokenizer_ngrams is None:
def tokenizer_ngrams(x): return x.split()
# unigrams
unigrams_list = [str(x) for x in tokenizer_ngrams(sentence)]
if prune_stopwords:
unigrams_list = _prune_stopwords(unigrams_list)
if ngrams == 1:
seqs += unigrams_list
# all ngrams in loop
else:
if all_ngrams:
ngram_lengths = range(1, ngrams + 1)
# seqs = [str(x) for x in simple_tokenizer(sentence)] # precompute length 1
else:
ngram_lengths = range(ngrams, ngrams + 1)
for ngram_length in ngram_lengths:
for idx_starting in range(0, len(unigrams_list) + 1 - ngram_length):
idx_ending = idx_starting + ngram_length
seq = ' '.join(unigrams_list[idx_starting: idx_ending]).strip()
# seq = ''.join([t.text + ' ' #t.whitespace_
# for t in unigrams_list[idx_starting: idx_ending]]).strip() # convert the tokens back to text
if len(seq) > 0 and not seq.isspace(): # str is not just whitespace
seqs.append(seq)
# add noun_chunks which at least have a space in them
if parsing == 'noun_chunks':
doc = nlp_chunks(sentence)
seqs += [
chunk.text for chunk in doc.noun_chunks
if ' ' in chunk.text
]
if pad_starting_ngrams:
assert all_ngrams is False, "pad_starting_ngrams only works when all_ngrams=False"
seqs_init = [' '.join(unigrams_list[:ngram_length])
for ngram_length in range(1, ngrams)]
seqs = seqs_init + seqs
if pad_ending_ngrams:
assert all_ngrams is False, "pad_ending_ngrams only works when all_ngrams=False"
seqs_end = [' '.join(unigrams_list[-ngram_length:])
for ngram_length in range(1, ngrams)][::-1]
seqs = seqs + seqs_end
freqs = Counter(seqs)
seqs = [seq for seq, freq in freqs.items() if freq >= min_frequency]
return seqs
def get_embs_llm(X: List[str], checkpoint: str):
"""Return embeddings from HF model given checkpoint name
(Fixed-size embedding by averaging over seq_len)
"""
pipe = pipeline(
"feature-extraction",
model=checkpoint,
truncation=True,
device=0
)
def get_emb(x):
return {'emb': pipe(x['text'])}
text = datasets.Dataset.from_dict({'text': X})
out_list = text.map(get_emb)['emb']
# out_list has shape (num_examples, 1, seq_len + 2, hidden_dim)
# convert to np array by averaging over seq_len (can't stack directly since seq lens vary)
num_examples = len(out_list)
dim_size = len(out_list[0][0][0])
embs = np.zeros((num_examples, dim_size))
logging.info('extract embs HF...')
for i in tqdm(range(num_examples)):
embs[i] = np.mean(out_list[i], axis=1) # avg over seq_len dim
return embs
def get_spacy_tokenizer(convert_output=True, convert_lower=True):
from spacy.lang.en import English
nlp = English()
if convert_output:
class LLMTreeTokenizer:
def __init__(self):
self.tok = nlp
# __call__ is defined conditionally so the convert_lower check happens once at class-definition time, not on every call
if convert_lower:
def __call__(self, s):
s = s.lower()
return [str(x) for x in self.tok(s)]
else:
def __call__(self, s):
return [str(x) for x in self.tok(s)]
return LLMTreeTokenizer()
else:
return nlp
STOPWORDS = {
'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
'some', 'such', 'nor', 'only', 'own', 'so', 'than', 'too', 'very',
'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
}
def _prune_stopwords(words_list, min_length=3):
return [
word for word in words_list
if word.lower() not in STOPWORDS
and len(word) >= min_length
]
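A quick illustrative sketch (not from the library itself): note that _prune_stopwords filters on token length as well as stopword membership, so short non-stopwords are dropped too.
# hypothetical example input; output inferred from the code above
_prune_stopwords(["The", "Mr", "Smith", "goes", "to", "Washington"])
# ['Smith', 'goes', 'Washington']  ('The'/'to' are stopwords; 'Mr' is shorter than min_length=3)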
Functions
def generate_ngrams_list(sentence: str, ngrams: int, tokenizer_ngrams=None, all_ngrams=False, parsing: str = '', nlp_chunks=None, pad_starting_ngrams=False, pad_ending_ngrams=False, min_frequency=1, prune_stopwords=False)
-
Get list of ngrams from sentence using a tokenizer
Params
ngrams: int
    What order of ngrams to use (1 for unigrams, 2 for bigrams, ...)
all_ngrams: bool
    whether to include all n-grams up to n or just n
pad_starting_ngrams: bool
    if all_ngrams=False, then pad starting ngrams with shorter length ngrams
    so that length of ngrams_list is the same as the initial sequence
    e.g. for ngrams=3 ["the", "the quick", "the quick brown", "quick brown fox", "brown fox jumps", ...]
pad_ending_ngrams: bool
    if all_ngrams=False, then pad ending ngrams with shorter length ngrams (mirrors pad_starting_ngrams)
min_frequency: int
    minimum frequency to be considered for the ngrams_list
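For illustration, a minimal usage sketch; the sentence and printed outputs below are assumptions based on the docstring and the source above, not taken from the library's tests:
from imodelsx.util import generate_ngrams_list

sentence = "the quick brown fox jumps"

# bigrams only
generate_ngrams_list(sentence, ngrams=2)
# ['the quick', 'quick brown', 'brown fox', 'fox jumps']

# all ngrams up to length 2 (unigrams followed by bigrams)
generate_ngrams_list(sentence, ngrams=2, all_ngrams=True)
# ['the', 'quick', 'brown', 'fox', 'jumps', 'the quick', 'quick brown', 'brown fox', 'fox jumps']

# pad with shorter starting ngrams so the list is as long as the token sequence
generate_ngrams_list(sentence, ngrams=3, pad_starting_ngrams=True)
# ['the', 'the quick', 'the quick brown', 'quick brown fox', 'brown fox jumps']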
def get_embs_llm(X: List[str], checkpoint: str)
-
Return embeddings from HF model given checkpoint name (Fixed-size embedding by averaging over seq_len)
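A hedged usage sketch: the checkpoint name below is only an example, and since the pipeline is created with device=0, a GPU is assumed to be available.
from imodelsx.util import get_embs_llm

texts = ["the quick brown fox", "jumps over the lazy dog"]
embs = get_embs_llm(texts, checkpoint="bert-base-uncased")  # example checkpoint
embs.shape  # (2, 768) for bert-base-uncased (hidden size 768)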
def get_spacy_tokenizer(convert_output=True, convert_lower=True)
-
Return a spaCy English tokenizer. If convert_output is True, the tokenizer is wrapped so that calling it on a string returns a list of token strings (lowercased when convert_lower is True); otherwise the raw spaCy pipeline is returned.
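A short sketch of how the wrapped tokenizer behaves and how it can plug into generate_ngrams_list; outputs are illustrative and spaCy must be installed:
from imodelsx.util import get_spacy_tokenizer, generate_ngrams_list

tok = get_spacy_tokenizer(convert_output=True, convert_lower=True)
tok("The Quick, Brown Fox!")
# ['the', 'quick', ',', 'brown', 'fox', '!']  (spaCy splits off punctuation)

# pass it as the tokenizer for generate_ngrams_list
generate_ngrams_list("The Quick Brown Fox", ngrams=2, tokenizer_ngrams=tok)
# ['the quick', 'quick brown', 'brown fox']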