Module imodelsx.auglinear.auglinear

Simple scikit-learn interface for Aug-Linear.

Augmenting Interpretable Models with LLMs during Training. Chandan Singh, Armin Askari, Rich Caruana, Jianfeng Gao. https://arxiv.org/abs/2209.11799
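
A minimal quick-start sketch (assuming the classes are importable from this module path; the toy texts and labels below are illustrative only):

from imodelsx.auglinear.auglinear import AugLinearClassifier

texts = ["a delightful, moving film", "tedious and poorly acted",
         "an instant classic", "a complete waste of time"]
labels = [1, 0, 1, 0]

# fit embeds each ngram, sums the embeddings per example, fits a linear model,
# and (by default) caches a coefficient for every training ngram
clf = AugLinearClassifier(checkpoint="bert-base-uncased", ngrams=2)
clf.fit(texts, labels)
preds = clf.predict(texts)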

Classes

class AugLinear (checkpoint: str = 'bert-base-uncased',
layer: str = 'last_hidden_state',
ngrams: int = 2,
all_ngrams: bool = False,
min_frequency: int = 1,
tokenizer_ngrams=None,
random_state=None,
normalize_embs=False,
cache_embs_dir: str = None,
fit_with_ngram_decomposition=True,
embedding_prefix='Represent the short phrase for sentiment classification: ',
embedding_suffix='',
embedding_ngram_strategy='mean',
zeroshot_class_dict: Dict[int, str] = None,
zeroshot_strategy: str = 'pos_class',
prune_stopwords: bool = False)
class AugLinear(BaseEstimator):
    def __init__(
        self,
        checkpoint: str = "bert-base-uncased",
        layer: str = "last_hidden_state",
        ngrams: int = 2,
        all_ngrams: bool = False,
        min_frequency: int = 1,
        tokenizer_ngrams=None,
        random_state=None,
        normalize_embs=False,
        cache_embs_dir: str = None,
        fit_with_ngram_decomposition=True,
        embedding_prefix="Represent the short phrase for sentiment classification: ",
        embedding_suffix="",
        embedding_ngram_strategy='mean',
        zeroshot_class_dict: Dict[int, str] = None,
        zeroshot_strategy: str = 'pos_class',
        prune_stopwords: bool = False,
    ):
        """AugLinear Class - use either AugLinearClassifier or AugLinearRegressor rather than initializing this class directly.

        Parameters
        ----------
        checkpoint: str
            Name of model checkpoint (e.g., to be fetched from HuggingFace)
        layer: str
            Name of layer to extract embeddings from
        ngrams
            Order of ngrams to extract. 1 for unigrams, 2 for bigrams, etc.
        all_ngrams
            Whether to use all order ngrams <= ngrams argument
        min_frequency
            minimum frequency of ngrams to be kept in the ngrams list.
        tokenizer_ngrams
            if None, defaults to spacy English tokenizer
        random_state
            random seed for fitting
        normalize_embs
            whether to normalize embeddings before fitting linear model
        cache_embs_dir: str = None,
            if not None, directory to save embeddings into
        fit_with_ngram_decomposition
            whether to fit in the aug-linear style (using the sum of embeddings of each ngram)
            if False, fits a typical model and uses ngram decomposition only for prediction / testing
            Usually, setting this to False will considerably impede performance
        embedding_prefix
            if checkpoint is an instructor/autoregressive model, prepend this prompt
        embedding_suffix
            if checkpoint is an autoregressive model, append this prompt
        embedding_ngram_strategy
            'mean': compute mean over ngram tokens
            'next_token_distr': use next token distribution as an embedding (requires AutoModelForCausalLM checkpoint)
        zeroshot_class_dict
            Maps class numbers to names of the class to use to compute the embedding
            Ex. {0: 'negative', 1: 'positive'}
        zeroshot_strategy
            'pos_class' or 'difference'
        prune_stopwords
            Whether to prune stopwords and ngrams with length < 3
        """
        self.checkpoint = checkpoint
        self.ngrams = ngrams
        if tokenizer_ngrams == None:
            from spacy.lang.en import English
            self.tokenizer_ngrams = English().tokenizer
        else:
            self.tokenizer_ngrams = tokenizer_ngrams
        self.layer = layer
        self.random_state = random_state
        self.all_ngrams = all_ngrams
        self.min_frequency = min_frequency
        self.normalize_embs = normalize_embs
        self.cache_embs_dir = cache_embs_dir
        self.fit_with_ngram_decomposition = fit_with_ngram_decomposition
        self.embedding_prefix = embedding_prefix
        self.embedding_suffix = embedding_suffix
        self.embedding_ngram_strategy = embedding_ngram_strategy
        self.zeroshot_class_dict = zeroshot_class_dict
        self.zeroshot_strategy = zeroshot_strategy
        self.prune_stopwords = prune_stopwords

    def fit(
        self,
        X: ArrayLike,
        y: ArrayLike,
        verbose=True,
        cache_linear_coefs: bool = True,
        batch_size: int = 8,
    ):
        """Extract embeddings then fit linear model

        Parameters
        ----------
        X: ArrayLike[str]
        y: ArrayLike[str]
        cache_linear_coefs
            Whether to compute and cache linear coefs into self.coefs_dict_
        batch_size, optional
            if not None, batch size to pass while calculating embeddings
        """

        # metadata
        if isinstance(self, ClassifierMixin):
            self.classes_ = unique_labels(y)
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # set up model
        if verbose:
            print("initializing model...")
        model, tokenizer_embeddings = self._get_model_and_tokenizer()

        # if zero-shot, then set linear and return
        if self.zeroshot_class_dict is not None:
            self._fit_zeroshot(model, tokenizer_embeddings, verbose=verbose)
            return self

        # get embs
        if verbose:
            print("calculating embeddings...")
        if self.cache_embs_dir is not None and os.path.exists(
            os.path.join(self.cache_embs_dir, "embs_train.pkl")
        ):
            embs = pkl.load(
                open(os.path.join(self.cache_embs_dir, "embs_train.pkl"), "rb")
            )
        else:
            embs = self._get_embs(
                X, model, tokenizer_embeddings, batch_size, summed=True)
            if self.cache_embs_dir is not None:
                os.makedirs(self.cache_embs_dir, exist_ok=True)
                pkl.dump(
                    embs,
                    open(os.path.join(self.cache_embs_dir, "embs_train.pkl"), "wb"),
                )

        # normalize embs
        if self.normalize_embs:
            self.normalizer = StandardScaler()
            embs = self.normalizer.fit_transform(embs)

        # train linear
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        if verbose:
            print("set up linear model...")
        if isinstance(self, ClassifierMixin):
            self.linear = LogisticRegressionCV()
        elif isinstance(self, RegressorMixin):
            self.linear = RidgeCV()
        self.linear.fit(embs, y)

        # cache linear coefs
        if cache_linear_coefs:
            if verbose:
                print("caching linear coefs...")
            self.cache_linear_coefs(X, model, tokenizer_embeddings)

        return self

    def _get_model_and_tokenizer(self):
        if self.checkpoint.startswith("hkunlp/instructor-xl"):
            from InstructorEmbedding import INSTRUCTOR
            model = INSTRUCTOR(self.checkpoint)
            tokenizer_embeddings = None
        else:
            tokenizer_embeddings = transformers.AutoTokenizer.from_pretrained(
                self.checkpoint
            )
            if self.embedding_ngram_strategy == 'next_token_distr':
                model = transformers.AutoModelForCausalLM.from_pretrained(
                    self.checkpoint,
                    device_map="auto",
                    torch_dtype=torch.float16,
                )
            else:
                model = transformers.AutoModel.from_pretrained(
                    self.checkpoint).to(device)

        return model.eval(), tokenizer_embeddings

    def cache_linear_coefs(
        self,
        X: ArrayLike,
        model=None,
        tokenizer_embeddings=None,
        renormalize_embs_strategy: str = None,
        batch_size: int = 8,
        verbose: bool = True,
        batch_size_embs: int = 512,
    ):
        """Cache linear coefs for ngrams into a dictionary self.coefs_dict_
        If it already exists, only add linear coefs for new ngrams

        Params
        ------
        renormalize_embs_strategy
            whether to renormalize embeddings before fitting linear model
            (useful if getting a test set that is different from the training)
            values: 'StandardScaler', 'QuantileTransformer'
        batch_size
            batch size to use for calculating embeddings (on gpu at same time)
        batch_size_embs
            batch size to use for number of embeddings stored (on cpu at same time)
        """
        assert renormalize_embs_strategy in [
            None, "StandardScaler", "QuantileTransformer", 'None']
        model, tokenizer_embeddings = self._get_model_and_tokenizer()

        ngrams_list = self._get_unique_ngrams_list(X)

        # dont recompute ngrams we already know
        if hasattr(self, "coefs_dict_"):
            coefs_dict_old = self.coefs_dict_
        else:
            coefs_dict_old = {}
        ngrams_list = [
            ngram for ngram in ngrams_list if not ngram in coefs_dict_old]
        if len(ngrams_list) == 0 and verbose:
            print("\tNothing to update!")
            return

        def normalize_embs(embs, renormalize_embs_strategy):
            if renormalize_embs_strategy in ["StandardScaler", "QuantileTransformer"]:
                if renormalize_embs_strategy == "StandardScaler":
                    embs = StandardScaler().fit_transform(embs)
                elif renormalize_embs_strategy == "QuantileTransformer":
                    embs = QuantileTransformer().fit_transform(embs)
            elif self.normalize_embs:
                embs = self.normalizer.transform(embs)
            return _clean_np_array(embs)

        # calculate linear coefs for each ngram in ngrams_list
        # (coef_embs is needed in both the batched and unbatched branches below)
        coef_embs = self.linear.coef_.squeeze().transpose()
        if batch_size_embs is not None:
            n_outputs = 1 if coef_embs.ndim == 1 else coef_embs.shape[1]
            linear_coef = np.zeros(shape=(len(ngrams_list), n_outputs))
            # calculate linear coefs in batches
            for i in tqdm(range(0, len(ngrams_list), batch_size_embs)):
                embs = self._get_embs(
                    ngrams_list[i: i + batch_size_embs],
                    model,
                    tokenizer_embeddings,
                    batch_size,
                    summed=False
                )
                embs = normalize_embs(embs, renormalize_embs_strategy)
                linear_coef[i: i + batch_size_embs] = (embs @ coef_embs).reshape(
                    -1, n_outputs
                )
        else:
            embs = self._get_embs(ngrams_list, model,
                                  tokenizer_embeddings, batch_size, summed=False)
            embs = normalize_embs(embs, renormalize_embs_strategy)
            linear_coef = embs @ coef_embs

        # save coefs
        linear_coef = linear_coef.squeeze()
        self.coefs_dict_ = {
            **coefs_dict_old,
            **{ngrams_list[i]: linear_coef[i] for i in range(len(ngrams_list))},
        }
        if verbose:
            print(
                f"\tAfter caching, len(coefs_dict_)={len(self.coefs_dict_)}, up from {len(coefs_dict_old)}")

    def _get_embs(self, X: List[str], model, tokenizer_embeddings, batch_size=8, summed=True):
        '''
        Returns
        -------
        embs: np.array
            num_examples x embedding_size
        '''
        kwargs = dict(
            model=model, tokenizer_embeddings=tokenizer_embeddings, tokenizer_ngrams=self.tokenizer_ngrams,
            checkpoint=self.checkpoint, layer=self.layer, batch_size=batch_size,
            embedding_prefix=self.embedding_prefix, embedding_suffix=self.embedding_suffix,
            prune_stopwords=self.prune_stopwords,
            embedding_strategy=self.embedding_ngram_strategy
        )

        if summed:
            embs = []
            for x in tqdm(X):
                emb = imodelsx.auglinear.embed.embed_and_sum_function(
                    x,
                    ngrams=self.ngrams,
                    all_ngrams=self.all_ngrams,
                    fit_with_ngram_decomposition=self.fit_with_ngram_decomposition,
                    **kwargs,
                )
                embs.append(emb["embs"])
            return _clean_np_array(np.array(embs).squeeze())
        else:
            # get embedding for a list of ngrams
            embs = imodelsx.auglinear.embed.embed_and_sum_function(
                X, ngrams=None, fit_with_ngram_decomposition=False, sum_embeddings=False, **kwargs,
            )["embs"]
            embs = np.array(embs).squeeze()
            assert embs.shape[0] == len(X)
            return _clean_np_array(embs)

    def _get_unique_ngrams_list(self, X):
        all_ngrams = set()
        for x in X:
            seqs = imodelsx.util.generate_ngrams_list(
                x,
                ngrams=self.ngrams,
                tokenizer_ngrams=self.tokenizer_ngrams,
                all_ngrams=self.all_ngrams,
                min_frequency=self.min_frequency,
                prune_stopwords=self.prune_stopwords,
            )
            all_ngrams |= set(seqs)
        return sorted(list(all_ngrams))

    def predict(self, X, warn=True):
        """For regression returns continuous output.
        For classification, returns discrete output.
        """

        check_is_fitted(self)
        preds = self._predict_cached(X, warn=warn)
        if isinstance(self, RegressorMixin):
            return preds
        elif isinstance(self, ClassifierMixin):
            # multiclass classification
            if preds.ndim > 1:
                return np.argmax(preds, axis=1)
            else:
                return (preds + self.linear.intercept_ > 0).astype(int)

    def predict_proba(self, X, warn=True):
        if not isinstance(self, ClassifierMixin):
            raise Exception("predict_proba only available for Classifier")
        check_is_fitted(self)
        preds = self._predict_cached(X, warn=warn)
        if preds.ndim == 1 or preds.shape[1] == 1:
            logits = np.vstack(
                (1 - preds.squeeze(), preds.squeeze())).transpose()
        else:  # multiclass classification
            logits = preds
        return softmax(logits, axis=1)

    def _predict_cached(self, X, warn=False):
        """Predict only the cached coefs in self.coefs_dict_"""
        assert hasattr(self, "coefs_dict_"), "coefs are not cached!"
        preds = []
        n_unseen_ngrams = 0
        n_classes = len(self.classes_) if hasattr(self, "classes_") else 1  # regressors have no classes_
        for x in X:
            if n_classes > 2:
                pred = np.zeros(n_classes)
            else:
                pred = 0
            seqs = imodelsx.util.generate_ngrams_list(
                x,
                ngrams=self.ngrams,
                tokenizer_ngrams=self.tokenizer_ngrams,
                all_ngrams=self.all_ngrams,
                prune_stopwords=self.prune_stopwords,
            )
            for seq in seqs:
                if seq in self.coefs_dict_:
                    pred += self.coefs_dict_[seq]
                else:
                    n_unseen_ngrams += 1
            preds.append(pred)
        if n_unseen_ngrams > 0 and warn:
            warnings.warn(
                f"Saw an unseen ungram {n_unseen_ngrams} times. \
For better performance, call cache_linear_coefs on the test dataset \
before calling predict."
            )
        return np.array(preds).squeeze()

    def _fit_zeroshot(self, model, tokenizer_embeddings, verbose):
        if verbose:
            print("setting up zero-shot linear model...")
        if len(self.zeroshot_class_dict) > 2:
            raise NotImplementedError(
                'Only binary classification supported for zero-shot')
        embs_dict = {}
        for i, class_num in enumerate(self.zeroshot_class_dict):
            class_names = self.zeroshot_class_dict[class_num]
            if not isinstance(class_names, list):
                class_names = [class_names]
            embs_class = (
                self._get_embs(
                    class_names,
                    model, tokenizer_embeddings,
                    summed=False,
                )
                .reshape((len(class_names), -1))
                .mean(axis=0).squeeze()
            )
            embs_dict[i] = deepcopy(embs_class)

        # take pos class or take difference?
        if self.zeroshot_strategy == 'pos_class':
            emb = embs_dict[1].squeeze()
        elif self.zeroshot_strategy == 'difference':
            emb = (embs_dict[1] - embs_dict[0]).squeeze()

        # set up linear model
        if isinstance(self, ClassifierMixin):
            self.linear = LogisticRegression()
        elif isinstance(self, RegressorMixin):
            self.linear = Ridge()
        self.linear.coef_ = emb / np.linalg.norm(emb)  # - embs[0]
        # self.linear.coef_ -= np.mean(self.linear.coef_)
        # self.linear.coef_ /= np.max(np.abs(self.linear.coef_))
        self.linear.intercept_ = 0  # -np.mean(np.abs(self.linear.coef_))
        return self

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

  • setting and getting parameters used by GridSearchCV and friends;
  • textual and HTML representation displayed in terminals and IDEs;
  • estimator serialization;
  • parameters validation;
  • data validation;
  • feature names validation.

Read more in the scikit-learn User Guide on rolling your own estimator.

Notes

All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).

Examples

>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])

AugLinear Class - use either AugLinearClassifier or AugLinearRegressor rather than initializing this class directly.

Parameters

checkpoint : str
Name of model checkpoint (e.g., to be fetched from HuggingFace)
layer : str
Name of layer to extract embeddings from
ngrams
Order of ngrams to extract. 1 for unigrams, 2 for bigrams, etc.
all_ngrams
Whether to use all ngrams of order <= the ngrams argument
min_frequency
Minimum frequency for an ngram to be kept in the ngrams list
tokenizer_ngrams
if None, defaults to the spaCy English tokenizer
random_state
random seed for fitting
normalize_embs
whether to normalize embeddings before fitting the linear model
cache_embs_dir : str, optional
if not None, directory to save embeddings into
fit_with_ngram_decomposition
Whether to fit in the aug-linear style (using the sum of embeddings of each ngram). If False, fits a typical model and uses ngram decomposition only for prediction / testing. Usually, setting this to False will considerably impede performance.
embedding_prefix
if checkpoint is an instructor/autoregressive model, prepend this prompt
embedding_suffix
if checkpoint is an autoregressive model, append this prompt
embedding_ngram_strategy
'mean': compute the mean over ngram tokens; 'next_token_distr': use the next-token distribution as the embedding (requires an AutoModelForCausalLM checkpoint)
zeroshot_class_dict
Maps class numbers to the class names used to compute the embedding, e.g. {0: 'negative', 1: 'positive'}
zeroshot_strategy
'pos_class' or 'difference' (a zero-shot sketch follows this parameter list)
prune_stopwords
Whether to prune stopwords and ngrams with length < 3
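
A hedged sketch of the zero-shot options above (the class names and texts are illustrative; labels are only used to infer the set of classes):

from imodelsx.auglinear.auglinear import AugLinearClassifier

texts = ["the film was wonderful", "the film was awful"]
labels = [1, 0]

clf = AugLinearClassifier(
    checkpoint="bert-base-uncased",
    zeroshot_class_dict={0: "negative", 1: "positive"},
    zeroshot_strategy="difference",  # coefficients from emb('positive') - emb('negative')
)
clf.fit(texts, labels)         # no linear model is trained; coefficients come from the class-name embeddings
clf.cache_linear_coefs(texts)  # cache per-ngram coefficients before predicting
preds = clf.predict(texts)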

Ancestors

  • sklearn.base.BaseEstimator
  • sklearn.utils._repr_html.base.ReprHTMLMixin
  • sklearn.utils._repr_html.base._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester

Subclasses

  • AugLinearClassifier
  • AugLinearRegressor

Methods

def cache_linear_coefs(self,
X: ArrayLike,
model=None,
tokenizer_embeddings=None,
renormalize_embs_strategy: str = None,
batch_size: int = 8,
verbose: bool = True,
batch_size_embs: int = 512)
def cache_linear_coefs(
    self,
    X: ArrayLike,
    model=None,
    tokenizer_embeddings=None,
    renormalize_embs_strategy: str = None,
    batch_size: int = 8,
    verbose: bool = True,
    batch_size_embs: int = 512,
):
    """Cache linear coefs for ngrams into a dictionary self.coefs_dict_
    If it already exists, only add linear coefs for new ngrams

    Params
    ------
    renormalize_embs_strategy
        whether to renormalize embeddings before fitting linear model
        (useful if getting a test set that is different from the training)
        values: 'StandardScaler', 'QuantileTransformer'
    batch_size
        batch size to use for calculating embeddings (on gpu at same time)
    batch_size_embs
        batch size to use for number of embeddings stored (on cpu at same time)
    """
    assert renormalize_embs_strategy in [
        None, "StandardScaler", "QuantileTransformer", 'None']
    model, tokenizer_embeddings = self._get_model_and_tokenizer()

    ngrams_list = self._get_unique_ngrams_list(X)

    # dont recompute ngrams we already know
    if hasattr(self, "coefs_dict_"):
        coefs_dict_old = self.coefs_dict_
    else:
        coefs_dict_old = {}
    ngrams_list = [
        ngram for ngram in ngrams_list if not ngram in coefs_dict_old]
    if len(ngrams_list) == 0 and verbose:
        print("\tNothing to update!")
        return

    def normalize_embs(embs, renormalize_embs_strategy):
        if renormalize_embs_strategy in ["StandardScaler", "QuantileTransformer"]:
            if renormalize_embs_strategy == "StandardScaler":
                embs = StandardScaler().fit_transform(embs)
            elif renormalize_embs_strategy == "QuantileTransformer":
                embs = QuantileTransformer().fit_transform(embs)
        elif self.normalize_embs:
            embs = self.normalizer.transform(embs)
        return _clean_np_array(embs)

    # calculate linear coefs for each ngram in ngrams_list
    # (coef_embs is needed in both the batched and unbatched branches below)
    coef_embs = self.linear.coef_.squeeze().transpose()
    if batch_size_embs is not None:
        n_outputs = 1 if coef_embs.ndim == 1 else coef_embs.shape[1]
        linear_coef = np.zeros(shape=(len(ngrams_list), n_outputs))
        # calculate linear coefs in batches
        for i in tqdm(range(0, len(ngrams_list), batch_size_embs)):
            embs = self._get_embs(
                ngrams_list[i: i + batch_size_embs],
                model,
                tokenizer_embeddings,
                batch_size,
                summed=False
            )
            embs = normalize_embs(embs, renormalize_embs_strategy)
            linear_coef[i: i + batch_size_embs] = (embs @ coef_embs).reshape(
                -1, n_outputs
            )
    else:
        embs = self._get_embs(ngrams_list, model,
                              tokenizer_embeddings, batch_size, summed=False)
        embs = normalize_embs(embs, renormalize_embs_strategy)
        linear_coef = embs @ coef_embs

    # save coefs
    linear_coef = linear_coef.squeeze()
    self.coefs_dict_ = {
        **coefs_dict_old,
        **{ngrams_list[i]: linear_coef[i] for i in range(len(ngrams_list))},
    }
    if verbose:
        print(
            f"\tAfter caching, len(coefs_dict_)={len(self.coefs_dict_)}, up from {len(coefs_dict_old)}")

Cache linear coefs for ngrams into a dictionary self.coefs_dict_. If it already exists, only add linear coefs for new ngrams.

Params

renormalize_embs_strategy
whether to renormalize embeddings before computing the linear coefs (useful if the test set differs from the training set); values: 'StandardScaler', 'QuantileTransformer'
batch_size
batch size to use for calculating embeddings (on gpu at same time)
batch_size_embs
batch size to use for number of embeddings stored (on cpu at same time)
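
For example (a sketch assuming clf was already fit and texts_test is a hypothetical held-out list of strings):

# only ngrams not already present in clf.coefs_dict_ are embedded and added
clf.cache_linear_coefs(texts_test)
preds = clf.predict(texts_test)

# if the test distribution differs from training, embeddings can be renormalized first:
# clf.cache_linear_coefs(texts_test, renormalize_embs_strategy="QuantileTransformer")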

def fit(self,
X: ArrayLike,
y: ArrayLike,
verbose=True,
cache_linear_coefs: bool = True,
batch_size: int = 8)
def fit(
    self,
    X: ArrayLike,
    y: ArrayLike,
    verbose=True,
    cache_linear_coefs: bool = True,
    batch_size: int = 8,
):
    """Extract embeddings then fit linear model

    Parameters
    ----------
    X: ArrayLike[str]
    y: ArrayLike[str]
    cache_linear_coefs
        Whether to compute and cache linear coefs into self.coefs_dict_
    batch_size, optional
        if not None, batch size to pass while calculating embeddings
    """

    # metadata
    if isinstance(self, ClassifierMixin):
        self.classes_ = unique_labels(y)
    if self.random_state is not None:
        np.random.seed(self.random_state)

    # set up model
    if verbose:
        print("initializing model...")
    model, tokenizer_embeddings = self._get_model_and_tokenizer()

    # if zero-shot, then set linear and return
    if self.zeroshot_class_dict is not None:
        self._fit_zeroshot(model, tokenizer_embeddings, verbose=verbose)
        return self

    # get embs
    if verbose:
        print("calculating embeddings...")
    if self.cache_embs_dir is not None and os.path.exists(
        os.path.join(self.cache_embs_dir, "embs_train.pkl")
    ):
        embs = pkl.load(
            open(os.path.join(self.cache_embs_dir, "embs_train.pkl"), "rb")
        )
    else:
        embs = self._get_embs(
            X, model, tokenizer_embeddings, batch_size, summed=True)
        if self.cache_embs_dir is not None:
            os.makedirs(self.cache_embs_dir, exist_ok=True)
            pkl.dump(
                embs,
                open(os.path.join(self.cache_embs_dir, "embs_train.pkl"), "wb"),
            )

    # normalize embs
    if self.normalize_embs:
        self.normalizer = StandardScaler()
        embs = self.normalizer.fit_transform(embs)

    # train linear
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    if verbose:
        print("set up linear model...")
    if isinstance(self, ClassifierMixin):
        self.linear = LogisticRegressionCV()
    elif isinstance(self, RegressorMixin):
        self.linear = RidgeCV()
    self.linear.fit(embs, y)

    # cache linear coefs
    if cache_linear_coefs:
        if verbose:
            print("caching linear coefs...")
        self.cache_linear_coefs(X, model, tokenizer_embeddings)

    return self

Extract embeddings then fit linear model

Parameters

X : ArrayLike[str]
 
y : ArrayLike[str]
 
cache_linear_coefs
Whether to compute and cache linear coefs into self.coefs_dict_
batch_size : int, optional
if not None, batch size to use while calculating embeddings
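
A sketch of these options (clf, texts, and labels are assumed from the earlier examples):

clf.fit(
    texts, labels,
    batch_size=16,             # batch size used when extracting embeddings
    cache_linear_coefs=False,  # skip caching here; call clf.cache_linear_coefs(...) later
    verbose=False,
)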
def predict(self, X, warn=True)
def predict(self, X, warn=True):
    """For regression returns continuous output.
    For classification, returns discrete output.
    """

    check_is_fitted(self)
    preds = self._predict_cached(X, warn=warn)
    if isinstance(self, RegressorMixin):
        return preds
    elif isinstance(self, ClassifierMixin):
        # multiclass classification
        if preds.ndim > 1:
            return np.argmax(preds, axis=1)
        else:
            return (preds + self.linear.intercept_ > 0).astype(int)

For regression, returns continuous output; for classification, returns discrete output.

def predict_proba(self, X, warn=True)
def predict_proba(self, X, warn=True):
    if not isinstance(self, ClassifierMixin):
        raise Exception("predict_proba only available for Classifier")
    check_is_fitted(self)
    preds = self._predict_cached(X, warn=warn)
    if preds.ndim == 1 or preds.shape[1] == 1:
        logits = np.vstack(
            (1 - preds.squeeze(), preds.squeeze())).transpose()
    else:  # multiclass classification
        logits = preds
    return softmax(logits, axis=1)
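
A short sketch contrasting the two prediction methods for a fitted classifier (texts_test is a hypothetical list of strings whose ngram coefficients are already cached):

labels_pred = clf.predict(texts_test)  # discrete labels (argmax for multiclass, thresholded sum for binary)
probs = clf.predict_proba(texts_test)  # shape (n_examples, n_classes)

Note that the probabilities are a softmax over the summed cached ngram coefficients rather than calibrated estimates.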
def set_fit_request(self: AugLinear,
*,
batch_size: bool | str | None = '$UNCHANGED$',
cache_linear_coefs: bool | str | None = '$UNCHANGED$',
verbose: bool | str | None = '$UNCHANGED$') ‑> AugLinear
def func(*args, **kw):
    """Updates the `_metadata_request` attribute of the consumer (`instance`)
    for the parameters provided as `**kw`.

    This docstring is overwritten below.
    See REQUESTER_DOC for expected functionality.
    """
    if not _routing_enabled():
        raise RuntimeError(
            "This method is only available when metadata routing is enabled."
            " You can enable it using"
            " sklearn.set_config(enable_metadata_routing=True)."
        )

    if self.validate_keys and (set(kw) - set(self.keys)):
        raise TypeError(
            f"Unexpected args: {set(kw) - set(self.keys)} in {self.name}. "
            f"Accepted arguments are: {set(self.keys)}"
        )

    # This makes it possible to use the decorated method as an unbound method,
    # for instance when monkeypatching.
    # https://github.com/scikit-learn/scikit-learn/issues/28632
    if instance is None:
        _instance = args[0]
        args = args[1:]
    else:
        _instance = instance

    # Replicating python's behavior when positional args are given other than
    # `self`, and `self` is only allowed if this method is unbound.
    if args:
        raise TypeError(
            f"set_{self.name}_request() takes 0 positional argument but"
            f" {len(args)} were given"
        )

    requests = _instance._get_metadata_request()
    method_metadata_request = getattr(requests, self.name)

    for prop, alias in kw.items():
        if alias is not UNCHANGED:
            method_metadata_request.add_request(param=prop, alias=alias)
    _instance._metadata_request = requests

    return _instance

Configure whether metadata should be requested to be passed to the fit method.

    Note that this method is only relevant when this estimator is used as a
    sub-estimator within a meta-estimator and metadata routing is enabled with
    ``enable_metadata_routing=True`` (see sklearn.set_config). Please check the
    User Guide on metadata routing for how the routing mechanism works.

    The options for each parameter are:

    - True: metadata is requested, and passed to fit if provided. The request is ignored if metadata is not provided.

    - False: metadata is not requested and the meta-estimator will not pass it to fit.

    - None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

    - str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

    The default (sklearn.utils.metadata_routing.UNCHANGED) retains the
    existing request. This allows you to change the request for some
    parameters and not others.

    Added in version 1.3.

    Parameters
    ----------
    batch_size : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
        Metadata routing for the batch_size parameter in fit.

    cache_linear_coefs : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
        Metadata routing for the cache_linear_coefs parameter in fit.

    verbose : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
        Metadata routing for the verbose parameter in fit.

    Returns
    -------
    self : object
        The updated object.
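
A sketch of requesting fit metadata inside a meta-estimator, assuming scikit-learn >= 1.3 with metadata routing enabled:

import sklearn
from imodelsx.auglinear.auglinear import AugLinearClassifier

sklearn.set_config(enable_metadata_routing=True)
clf = AugLinearClassifier().set_fit_request(batch_size=True, verbose=False)

A meta-estimator would then route a user-supplied batch_size through to fit, while verbose is never passed along.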
def set_predict_proba_request(self: AugLinear,
*,
warn: bool | str | None = '$UNCHANGED$') ‑> AugLinear
def func(*args, **kw):
    """Updates the `_metadata_request` attribute of the consumer (`instance`)
    for the parameters provided as `**kw`.

    This docstring is overwritten below.
    See REQUESTER_DOC for expected functionality.
    """
    if not _routing_enabled():
        raise RuntimeError(
            "This method is only available when metadata routing is enabled."
            " You can enable it using"
            " sklearn.set_config(enable_metadata_routing=True)."
        )

    if self.validate_keys and (set(kw) - set(self.keys)):
        raise TypeError(
            f"Unexpected args: {set(kw) - set(self.keys)} in {self.name}. "
            f"Accepted arguments are: {set(self.keys)}"
        )

    # This makes it possible to use the decorated method as an unbound method,
    # for instance when monkeypatching.
    # https://github.com/scikit-learn/scikit-learn/issues/28632
    if instance is None:
        _instance = args[0]
        args = args[1:]
    else:
        _instance = instance

    # Replicating python's behavior when positional args are given other than
    # `self`, and `self` is only allowed if this method is unbound.
    if args:
        raise TypeError(
            f"set_{self.name}_request() takes 0 positional argument but"
            f" {len(args)} were given"
        )

    requests = _instance._get_metadata_request()
    method_metadata_request = getattr(requests, self.name)

    for prop, alias in kw.items():
        if alias is not UNCHANGED:
            method_metadata_request.add_request(param=prop, alias=alias)
    _instance._metadata_request = requests

    return _instance

Configure whether metadata should be requested to be passed to the predict_proba method.

    Note that this method is only relevant when this estimator is used as a
    sub-estimator within a meta-estimator and metadata routing is enabled with
    ``enable_metadata_routing=True`` (see sklearn.set_config). Please check the
    User Guide on metadata routing for how the routing mechanism works.

    The options for each parameter are:

    - True: metadata is requested, and passed to predict_proba if provided. The request is ignored if metadata is not provided.

    - False: metadata is not requested and the meta-estimator will not pass it to predict_proba.

    - None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

    - str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

    The default (sklearn.utils.metadata_routing.UNCHANGED) retains the
    existing request. This allows you to change the request for some
    parameters and not others.

    Added in version 1.3.

    Parameters
    ----------
    warn : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
        Metadata routing for the warn parameter in predict_proba.

    Returns
    -------
    self : object
        The updated object.
def set_predict_request(self: AugLinear,
*,
warn: bool | str | None = '$UNCHANGED$') ‑> AugLinear
def func(*args, **kw):
    """Updates the `_metadata_request` attribute of the consumer (`instance`)
    for the parameters provided as `**kw`.

    This docstring is overwritten below.
    See REQUESTER_DOC for expected functionality.
    """
    if not _routing_enabled():
        raise RuntimeError(
            "This method is only available when metadata routing is enabled."
            " You can enable it using"
            " sklearn.set_config(enable_metadata_routing=True)."
        )

    if self.validate_keys and (set(kw) - set(self.keys)):
        raise TypeError(
            f"Unexpected args: {set(kw) - set(self.keys)} in {self.name}. "
            f"Accepted arguments are: {set(self.keys)}"
        )

    # This makes it possible to use the decorated method as an unbound method,
    # for instance when monkeypatching.
    # https://github.com/scikit-learn/scikit-learn/issues/28632
    if instance is None:
        _instance = args[0]
        args = args[1:]
    else:
        _instance = instance

    # Replicating python's behavior when positional args are given other than
    # `self`, and `self` is only allowed if this method is unbound.
    if args:
        raise TypeError(
            f"set_{self.name}_request() takes 0 positional argument but"
            f" {len(args)} were given"
        )

    requests = _instance._get_metadata_request()
    method_metadata_request = getattr(requests, self.name)

    for prop, alias in kw.items():
        if alias is not UNCHANGED:
            method_metadata_request.add_request(param=prop, alias=alias)
    _instance._metadata_request = requests

    return _instance

Configure whether metadata should be requested to be passed to the predict method.

    Note that this method is only relevant when this estimator is used as a
    sub-estimator within a meta-estimator and metadata routing is enabled with
    ``enable_metadata_routing=True`` (see sklearn.set_config). Please check the
    User Guide on metadata routing for how the routing mechanism works.

    The options for each parameter are:

    - True: metadata is requested, and passed to predict if provided. The request is ignored if metadata is not provided.

    - False: metadata is not requested and the meta-estimator will not pass it to predict.

    - None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

    - str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

    The default (sklearn.utils.metadata_routing.UNCHANGED) retains the
    existing request. This allows you to change the request for some
    parameters and not others.

    Added in version 1.3.

    Parameters
    ----------
    warn : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
        Metadata routing for the warn parameter in predict.

    Returns
    -------
    self : object
        The updated object.
class AugLinearClassifier (checkpoint: str = 'bert-base-uncased',
layer: str = 'last_hidden_state',
ngrams: int = 2,
all_ngrams: bool = False,
min_frequency: int = 1,
tokenizer_ngrams=None,
random_state=None,
normalize_embs=False,
cache_embs_dir: str = None,
fit_with_ngram_decomposition=True,
embedding_prefix='Represent the short phrase for sentiment classification: ',
embedding_suffix='',
embedding_ngram_strategy='mean',
zeroshot_class_dict: Dict[int, str] = None,
zeroshot_strategy: str = 'pos_class',
prune_stopwords: bool = False)
class AugLinearClassifier(AugLinear, ClassifierMixin):
    ...

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

  • setting and getting parameters used by GridSearchCV and friends;
  • textual and HTML representation displayed in terminals and IDEs;
  • estimator serialization;
  • parameters validation;
  • data validation;
  • feature names validation.

Read more in the scikit-learn User Guide on rolling your own estimator.

Notes

All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).

Examples

>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])

AugLinear Class - use either AugLinearClassifier or AugLinearRegressor rather than initializing this class directly.

Parameters

checkpoint : str
Name of model checkpoint (e.g., to be fetched from HuggingFace)
layer : str
Name of layer to extract embeddings from
ngrams
Order of ngrams to extract. 1 for unigrams, 2 for bigrams, etc.
all_ngrams
Whether to use all ngrams of order <= the ngrams argument
min_frequency
Minimum frequency for an ngram to be kept in the ngrams list
tokenizer_ngrams
if None, defaults to the spaCy English tokenizer
random_state
random seed for fitting
normalize_embs
whether to normalize embeddings before fitting the linear model
cache_embs_dir : str, optional
if not None, directory to save embeddings into
fit_with_ngram_decomposition
Whether to fit in the aug-linear style (using the sum of embeddings of each ngram). If False, fits a typical model and uses ngram decomposition only for prediction / testing. Usually, setting this to False will considerably impede performance.
embedding_prefix
if checkpoint is an instructor/autoregressive model, prepend this prompt
embedding_suffix
if checkpoint is an autoregressive model, append this prompt
embedding_ngram_strategy
'mean': compute the mean over ngram tokens; 'next_token_distr': use the next-token distribution as the embedding (requires an AutoModelForCausalLM checkpoint)
zeroshot_class_dict
Maps class numbers to the class names used to compute the embedding, e.g. {0: 'negative', 1: 'positive'}
zeroshot_strategy
'pos_class' or 'difference'
prune_stopwords
Whether to prune stopwords and ngrams with length < 3
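
After fitting, the cached per-ngram coefficients can be inspected directly for interpretation. A sketch for a binary classifier (where each cached value is a scalar; texts and labels are assumed from the earlier sketches):

clf = AugLinearClassifier(checkpoint="bert-base-uncased", ngrams=2)
clf.fit(texts, labels)

# ngrams with the largest positive and negative contributions
top = sorted(clf.coefs_dict_.items(), key=lambda kv: kv[1], reverse=True)
for ngram, coef in top[:5] + top[-5:]:
    print(f"{coef:+.3f}  {ngram}")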

Ancestors

  • AugLinear
  • sklearn.base.BaseEstimator
  • sklearn.utils._repr_html.base.ReprHTMLMixin
  • sklearn.utils._repr_html.base._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester
  • sklearn.base.ClassifierMixin

Methods

def set_score_request(self: AugLinearClassifier,
*,
sample_weight: bool | str | None = '$UNCHANGED$') ‑> AugLinearClassifier
def func(*args, **kw):
    """Updates the `_metadata_request` attribute of the consumer (`instance`)
    for the parameters provided as `**kw`.

    This docstring is overwritten below.
    See REQUESTER_DOC for expected functionality.
    """
    if not _routing_enabled():
        raise RuntimeError(
            "This method is only available when metadata routing is enabled."
            " You can enable it using"
            " sklearn.set_config(enable_metadata_routing=True)."
        )

    if self.validate_keys and (set(kw) - set(self.keys)):
        raise TypeError(
            f"Unexpected args: {set(kw) - set(self.keys)} in {self.name}. "
            f"Accepted arguments are: {set(self.keys)}"
        )

    # This makes it possible to use the decorated method as an unbound method,
    # for instance when monkeypatching.
    # https://github.com/scikit-learn/scikit-learn/issues/28632
    if instance is None:
        _instance = args[0]
        args = args[1:]
    else:
        _instance = instance

    # Replicating python's behavior when positional args are given other than
    # `self`, and `self` is only allowed if this method is unbound.
    if args:
        raise TypeError(
            f"set_{self.name}_request() takes 0 positional argument but"
            f" {len(args)} were given"
        )

    requests = _instance._get_metadata_request()
    method_metadata_request = getattr(requests, self.name)

    for prop, alias in kw.items():
        if alias is not UNCHANGED:
            method_metadata_request.add_request(param=prop, alias=alias)
    _instance._metadata_request = requests

    return _instance

Configure whether metadata should be requested to be passed to the score method.

    Note that this method is only relevant when this estimator is used as a
    sub-estimator within a meta-estimator and metadata routing is enabled with
    ``enable_metadata_routing=True`` (see sklearn.set_config). Please check the
    User Guide on metadata routing for how the routing mechanism works.

    The options for each parameter are:

    - True: metadata is requested, and passed to score if provided. The request is ignored if metadata is not provided.

    - False: metadata is not requested and the meta-estimator will not pass it to score.

    - None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

    - str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

    The default (sklearn.utils.metadata_routing.UNCHANGED) retains the
    existing request. This allows you to change the request for some
    parameters and not others.

    Added in version 1.3.

    Parameters
    ----------
    sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
        Metadata routing for the sample_weight parameter in score.

    Returns
    -------
    self : object
        The updated object.

Inherited members

class AugLinearRegressor (checkpoint: str = 'bert-base-uncased',
layer: str = 'last_hidden_state',
ngrams: int = 2,
all_ngrams: bool = False,
min_frequency: int = 1,
tokenizer_ngrams=None,
random_state=None,
normalize_embs=False,
cache_embs_dir: str = None,
fit_with_ngram_decomposition=True,
embedding_prefix='Represent the short phrase for sentiment classification: ',
embedding_suffix='',
embedding_ngram_strategy='mean',
zeroshot_class_dict: Dict[int, str] = None,
zeroshot_strategy: str = 'pos_class',
prune_stopwords: bool = False)
class AugLinearRegressor(AugLinear, RegressorMixin):
    ...

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

  • setting and getting parameters used by GridSearchCV and friends;
  • textual and HTML representation displayed in terminals and IDEs;
  • estimator serialization;
  • parameters validation;
  • data validation;
  • feature names validation.

Read more in the scikit-learn User Guide on rolling your own estimator.

Notes

All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).

Examples

>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])

AugLinear Class - use either AugLinearClassifier or AugLinearRegressor rather than initializing this class directly.

Parameters

checkpoint : str
Name of model checkpoint (e.g., to be fetched from HuggingFace)
layer : str
Name of layer to extract embeddings from
ngrams
Order of ngrams to extract. 1 for unigrams, 2 for bigrams, etc.
all_ngrams
Whether to use all ngrams of order <= the ngrams argument
min_frequency
Minimum frequency for an ngram to be kept in the ngrams list
tokenizer_ngrams
if None, defaults to the spaCy English tokenizer
random_state
random seed for fitting
normalize_embs
whether to normalize embeddings before fitting the linear model
cache_embs_dir : str, optional
if not None, directory to save embeddings into
fit_with_ngram_decomposition
Whether to fit in the aug-linear style (using the sum of embeddings of each ngram). If False, fits a typical model and uses ngram decomposition only for prediction / testing. Usually, setting this to False will considerably impede performance.
embedding_prefix
if checkpoint is an instructor/autoregressive model, prepend this prompt
embedding_suffix
if checkpoint is an autoregressive model, append this prompt
embedding_ngram_strategy
'mean': compute the mean over ngram tokens; 'next_token_distr': use the next-token distribution as the embedding (requires an AutoModelForCausalLM checkpoint)
zeroshot_class_dict
Maps class numbers to the class names used to compute the embedding, e.g. {0: 'negative', 1: 'positive'}
zeroshot_strategy
'pos_class' or 'difference'
prune_stopwords
Whether to prune stopwords and ngrams with length < 3
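
A minimal regression sketch (the toy texts and scores are illustrative only):

from imodelsx.auglinear.auglinear import AugLinearRegressor

texts = ["loved every minute of it", "fell asleep halfway through"]
scores = [4.5, 1.0]

reg = AugLinearRegressor(checkpoint="bert-base-uncased", ngrams=2, all_ngrams=True)
reg.fit(texts, scores)
print(reg.predict(texts))  # continuous predictions from the summed ngram coefficients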

Ancestors

  • AugLinear
  • sklearn.base.BaseEstimator
  • sklearn.utils._repr_html.base.ReprHTMLMixin
  • sklearn.utils._repr_html.base._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester
  • sklearn.base.RegressorMixin

Methods

def set_score_request(self: AugLinearRegressor,
*,
sample_weight: bool | str | None = '$UNCHANGED$') ‑> AugLinearRegressor
def func(*args, **kw):
    """Updates the `_metadata_request` attribute of the consumer (`instance`)
    for the parameters provided as `**kw`.

    This docstring is overwritten below.
    See REQUESTER_DOC for expected functionality.
    """
    if not _routing_enabled():
        raise RuntimeError(
            "This method is only available when metadata routing is enabled."
            " You can enable it using"
            " sklearn.set_config(enable_metadata_routing=True)."
        )

    if self.validate_keys and (set(kw) - set(self.keys)):
        raise TypeError(
            f"Unexpected args: {set(kw) - set(self.keys)} in {self.name}. "
            f"Accepted arguments are: {set(self.keys)}"
        )

    # This makes it possible to use the decorated method as an unbound method,
    # for instance when monkeypatching.
    # https://github.com/scikit-learn/scikit-learn/issues/28632
    if instance is None:
        _instance = args[0]
        args = args[1:]
    else:
        _instance = instance

    # Replicating python's behavior when positional args are given other than
    # `self`, and `self` is only allowed if this method is unbound.
    if args:
        raise TypeError(
            f"set_{self.name}_request() takes 0 positional argument but"
            f" {len(args)} were given"
        )

    requests = _instance._get_metadata_request()
    method_metadata_request = getattr(requests, self.name)

    for prop, alias in kw.items():
        if alias is not UNCHANGED:
            method_metadata_request.add_request(param=prop, alias=alias)
    _instance._metadata_request = requests

    return _instance

Configure whether metadata should be requested to be passed to the score method.

    Note that this method is only relevant when this estimator is used as a
    sub-estimator within a meta-estimator and metadata routing is enabled with
    ``enable_metadata_routing=True`` (see sklearn.set_config). Please check the
    User Guide on metadata routing for how the routing mechanism works.

    The options for each parameter are:

    - True: metadata is requested, and passed to score if provided. The request is ignored if metadata is not provided.

    - False: metadata is not requested and the meta-estimator will not pass it to score.

    - None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

    - str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

    The default (sklearn.utils.metadata_routing.UNCHANGED) retains the
    existing request. This allows you to change the request for some
    parameters and not others.

    Added in version 1.3.

    Parameters
    ----------
    sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
        Metadata routing for the sample_weight parameter in score.

    Returns
    -------
    self : object
        The updated object.

Inherited members