Module imodelsx.augtree.data

Expand source code
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import imodelsx.data
import imodelsx.augtree.utils


def convert_text_data_to_counts_array(
    X_train, X_test, ngrams=2, all_ngrams=True,
    tokenizer=None,
):
    """Turn raw text into n-gram count matrices with a shared vocabulary.

    The vocabulary is fit on ``X_train`` only; ``X_test`` is transformed
    with that same vocabulary.

    Params
    ------
    X_train
        Iterable of training documents (strings).
    X_test
        Iterable of test documents (strings).
    ngrams
        Largest n-gram size to extract.
    all_ngrams
        If True, use every n-gram size from 1 up to ``ngrams``;
        otherwise use only n-grams of exactly size ``ngrams``.
    tokenizer
        Callable passed to ``CountVectorizer`` as its tokenizer. When None,
        the tokenizer from ``imodelsx.augtree.utils.get_spacy_tokenizer()``
        is used.

    Returns
    -------
    X_train, X_test, feature_names
        The two count matrices produced by ``CountVectorizer`` and the
        list of feature names (one per matrix column).
    """
    # Identity check: a callable tokenizer could define its own __eq__.
    if tokenizer is None:
        tokenizer = imodelsx.augtree.utils.get_spacy_tokenizer()

    min_n = 1 if all_ngrams else ngrams

    vectorizer = CountVectorizer(
        ngram_range=(min_n, ngrams),
        tokenizer=tokenizer,
        lowercase=True,
        # token_pattern is unused when a tokenizer is supplied; pass None
        # explicitly rather than leaving the default regex in place.
        token_pattern=None,
    )
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    feature_names = vectorizer.get_feature_names_out().tolist()
    return X_train, X_test, feature_names


def get_all_data(args):
    """Load a text dataset and return train / cv / test count features.

    Reads ``args.dataset_name``, ``args.subsample_frac`` and ``args.seed``.
    The text is converted to n-gram counts with the default settings of
    ``convert_text_data_to_counts_array``, then 33% of the training rows
    are held out as a cross-validation split (seeded by ``args.seed``).

    Returns
    -------
    X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names
    """
    loaded = imodelsx.data.load_huggingface_dataset(
        dataset_name=args.dataset_name,
        subsample_frac=args.subsample_frac,
        return_lists=True,
    )
    text_train, text_test, labels_train, y_test = loaded

    counts_train, X_test, feature_names = convert_text_data_to_counts_array(
        text_train, text_test
    )

    # Carve the cross-validation split out of the training portion only.
    X_train, X_cv, y_train, y_cv = train_test_split(
        counts_train,
        labels_train,
        test_size=0.33,
        random_state=args.seed,
    )
    return X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names

Functions

def convert_text_data_to_counts_array(X_train, X_test, ngrams=2, all_ngrams=True, tokenizer=None)
Expand source code
def convert_text_data_to_counts_array(
    X_train, X_test, ngrams=2, all_ngrams=True,
    tokenizer=None,
):
    """Turn raw text into n-gram count matrices with a shared vocabulary.

    The vocabulary is fit on ``X_train`` only; ``X_test`` is transformed
    with that same vocabulary.

    Params
    ------
    X_train
        Iterable of training documents (strings).
    X_test
        Iterable of test documents (strings).
    ngrams
        Largest n-gram size to extract.
    all_ngrams
        If True, use every n-gram size from 1 up to ``ngrams``;
        otherwise use only n-grams of exactly size ``ngrams``.
    tokenizer
        Callable passed to ``CountVectorizer`` as its tokenizer. When None,
        the tokenizer from ``imodelsx.augtree.utils.get_spacy_tokenizer()``
        is used.

    Returns
    -------
    X_train, X_test, feature_names
        The two count matrices produced by ``CountVectorizer`` and the
        list of feature names (one per matrix column).
    """
    # Identity check: a callable tokenizer could define its own __eq__.
    if tokenizer is None:
        tokenizer = imodelsx.augtree.utils.get_spacy_tokenizer()

    min_n = 1 if all_ngrams else ngrams

    vectorizer = CountVectorizer(
        ngram_range=(min_n, ngrams),
        tokenizer=tokenizer,
        lowercase=True,
        # token_pattern is unused when a tokenizer is supplied; pass None
        # explicitly rather than leaving the default regex in place.
        token_pattern=None,
    )
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    feature_names = vectorizer.get_feature_names_out().tolist()
    return X_train, X_test, feature_names
def get_all_data(args)
Expand source code
def get_all_data(args):
    """Load a text dataset and return train / cv / test count features.

    Reads ``args.dataset_name``, ``args.subsample_frac`` and ``args.seed``.
    The text is converted to n-gram counts with the default settings of
    ``convert_text_data_to_counts_array``, then 33% of the training rows
    are held out as a cross-validation split (seeded by ``args.seed``).

    Returns
    -------
    X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names
    """
    loaded = imodelsx.data.load_huggingface_dataset(
        dataset_name=args.dataset_name,
        subsample_frac=args.subsample_frac,
        return_lists=True,
    )
    text_train, text_test, labels_train, y_test = loaded

    counts_train, X_test, feature_names = convert_text_data_to_counts_array(
        text_train, text_test
    )

    # Carve the cross-validation split out of the training portion only.
    X_train, X_cv, y_train, y_cv = train_test_split(
        counts_train,
        labels_train,
        test_size=0.33,
        random_state=args.seed,
    )
    return X_train, X_cv, X_test, y_train, y_cv, y_test, feature_names