Module imodelsx.util

Functions

def generate_ngrams_list(sentence: str,
                         ngrams: int,
                         tokenizer_ngrams=None,
                         all_ngrams=False,
                         parsing: str = '',
                         nlp_chunks=None,
                         pad_starting_ngrams=False,
                         pad_ending_ngrams=False,
                         min_frequency=1,
                         prune_stopwords=False)
def generate_ngrams_list(
    sentence: str,
    ngrams: int,
    tokenizer_ngrams=None,
    all_ngrams=False,
    parsing: str = '',
    nlp_chunks=None,
    pad_starting_ngrams=False,
    pad_ending_ngrams=False,
    min_frequency=1,
    prune_stopwords=False,
):
    """Get list of ngrams from sentence using a tokenizer

    Params
    ------
    ngrams: int
        What order of ngrams to use (1 for unigrams, 2 for bigrams, ...)
    all_ngrams: bool
        whether to include all n-grams up to n or just n
    pad_starting_ngrams: bool
        if all_ngrams=False, then pad starting ngrams with shorter length ngrams
        so that length of ngrams_list is the same as the initial sequence
        e.g. for ngrams=3 ["the", "the quick", "the quick brown", "quick brown fox", "brown fox jumps", ...]
    pad_ending_ngrams: bool
    min_frequency: int
        minimum frequency to be considered for the ngrams_list
    """

    seqs = []

    if tokenizer_ngrams is None:
        def tokenizer_ngrams(x):
            return x.split()

    # unigrams
    unigrams_list = [str(x) for x in tokenizer_ngrams(sentence)]
    if prune_stopwords:
        unigrams_list = _prune_stopwords(unigrams_list)

    if ngrams == 1:
        seqs += unigrams_list

    # all ngrams in loop
    else:
        if all_ngrams:
            ngram_lengths = range(1, ngrams + 1)
        else:
            ngram_lengths = range(ngrams, ngrams + 1)

        for ngram_length in ngram_lengths:
            for idx_starting in range(0, len(unigrams_list) + 1 - ngram_length):
                idx_ending = idx_starting + ngram_length
                seq = ' '.join(unigrams_list[idx_starting: idx_ending]).strip()
                if len(seq) > 0 and not seq.isspace():  # str is not just whitespace
                    seqs.append(seq)

    # add noun_chunks which at least have a space in them
    if parsing == 'noun_chunks':
        doc = nlp_chunks(sentence)
        seqs += [
            chunk.text for chunk in doc.noun_chunks
            if ' ' in chunk.text
        ]

    if pad_starting_ngrams:
        assert all_ngrams is False, "pad_starting_ngrams only works when all_ngrams=False"
        seqs_init = [' '.join(unigrams_list[:ngram_length])
                     for ngram_length in range(1, ngrams)]
        seqs = seqs_init + seqs

    if pad_ending_ngrams:
        assert all_ngrams is False, "pad_ending_ngrams only works when all_ngrams=False"
        seqs_end = [' '.join(unigrams_list[-ngram_length:])
                    for ngram_length in range(1, ngrams)][::-1]
        seqs = seqs + seqs_end

    freqs = Counter(seqs)

    seqs = [seq for seq, freq in freqs.items() if freq >= min_frequency]

    return seqs

Get list of ngrams from sentence using a tokenizer

Params

ngrams: int
    What order of ngrams to use (1 for unigrams, 2 for bigrams, ...)
all_ngrams: bool
    Whether to include all n-grams up to n or just n
pad_starting_ngrams: bool
    If all_ngrams=False, pad starting ngrams with shorter-length ngrams so that the length of ngrams_list matches the initial sequence, e.g. for ngrams=3 ["the", "the quick", "the quick brown", "quick brown fox", "brown fox jumps", ...]
pad_ending_ngrams: bool
    If all_ngrams=False, analogously pad the end of the list with shorter-length ngrams
min_frequency: int
    Minimum frequency for an ngram to be kept in ngrams_list
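A minimal usage sketch (not part of the library docs; assumes imodelsx is installed and uses the default whitespace tokenizer):

from imodelsx.util import generate_ngrams_list

sentence = 'the quick brown fox jumps over the lazy dog'

# bigrams only
generate_ngrams_list(sentence, ngrams=2)
# ['the quick', 'quick brown', 'brown fox', ...]

# unigrams and bigrams together
generate_ngrams_list(sentence, ngrams=2, all_ngrams=True)

# bigrams padded at the start so the list lines up with the token sequence
generate_ngrams_list(sentence, ngrams=2, pad_starting_ngrams=True)
# ['the', 'the quick', 'quick brown', ...]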

def get_embs_llm(X: List[str], checkpoint: str)
def get_embs_llm(X: List[str], checkpoint: str):
    """Return embeddings from HF model given checkpoint name
    (Fixed-size embedding by averaging over seq_len)
    """
    pipe = pipeline(
        "feature-extraction",
        model=checkpoint,
        truncation=True,
        device=0
    )

    def get_emb(x):
        return {'emb': pipe(x['text'])}
    text = datasets.Dataset.from_dict({'text': X})
    out_list = text.map(get_emb)['emb']
    # out_list has shape (num_examples, 1, seq_len + 2, hidden_size), where seq_len varies per example

    # convert to np array by averaging over seq_len (can't stack directly since seq lens vary)
    num_examples = len(out_list)
    dim_size = len(out_list[0][0][0])
    embs = np.zeros((num_examples, dim_size))
    logging.info('extract embs HF...')
    for i in tqdm(range(num_examples)):
        embs[i] = np.mean(out_list[i], axis=1)  # avg over seq_len dim
    return embs

Return embeddings from HF model given checkpoint name (Fixed-size embedding by averaging over seq_len)
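A minimal usage sketch (not from the source): this assumes transformers and datasets are installed and a CUDA device is available, since the pipeline is pinned to device=0. The checkpoint name below is just an illustration; any HF model usable with the "feature-extraction" pipeline should work.

from imodelsx.util import get_embs_llm

texts = ['the quick brown fox', 'hello world']
embs = get_embs_llm(texts, checkpoint='bert-base-uncased')
print(embs.shape)  # (2, 768) for a base-size BERT model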

def get_spacy_tokenizer(convert_output=True, convert_lower=True)
def get_spacy_tokenizer(convert_output=True, convert_lower=True):
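    """Return a spaCy English tokenizer.
    If convert_output, wrap it in a callable that returns a list of token strings
    (lowercased first when convert_lower); otherwise return the raw spaCy pipeline.
    """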
    from spacy.lang.en import English
    nlp = English()
    if convert_output:
        class LLMTreeTokenizer:
            def __init__(self):
                self.tok = nlp

            # written kind of weirdly to optimize the speed of the tokenizer
            if convert_lower:
                def __call__(self, s):
                    s = s.lower()
                    return [str(x) for x in self.tok(s)]
            else:
                def __call__(self, s):
                    return [str(x) for x in self.tok(s)]
        return LLMTreeTokenizer()
    else:
        return nlp
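
A short usage sketch (assumes spaCy is installed; the blank English tokenizer used here needs no extra model download). The returned callable matches the tokenizer_ngrams interface expected by generate_ngrams_list above.

from imodelsx.util import get_spacy_tokenizer, generate_ngrams_list

tokenizer = get_spacy_tokenizer(convert_output=True, convert_lower=True)
tokenizer('The Quick, Brown Fox!')  # ['the', 'quick', ',', 'brown', 'fox', '!']

# can be plugged into generate_ngrams_list as the tokenizer
generate_ngrams_list('The quick brown fox', ngrams=2, tokenizer_ngrams=tokenizer)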