Module imodelsx.linear_finetune

Simple scikit-learn interface for finetuning a single linear layer on top of LLM embeddings.
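
The classifier usage is shown in the class example further down; the regressor works the same way. A minimal sketch, assuming LinearFinetuneRegressor is exported from the package top level like the classifier and that the targets are continuous (the texts and scores below are made up):

from imodelsx import LinearFinetuneRegressor
import numpy as np

# made-up texts with continuous targets
texts = ['a gripping, beautifully shot film', 'tedious and overlong', 'a pleasant surprise', 'forgettable']
scores = np.array([0.9, 0.2, 0.7, 0.4])

# embeds each text with the checkpoint, then fits RidgeCV on the embeddings
reg = LinearFinetuneRegressor(checkpoint='distilbert-base-uncased')
reg.fit(texts, scores)
print(reg.predict(['surprisingly moving']))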

Source code
"""
Simple scikit-learn interface for finetuning a single linear layer on top of LLM embeddings.
"""
from numpy.typing import ArrayLike
import numpy as np
from scipy.special import softmax
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.linear_model import LogisticRegressionCV, RidgeCV
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_is_fitted
# from spacy.lang.en import English
from scipy.sparse import issparse
from sklearn.preprocessing import StandardScaler
import transformers
from tqdm import tqdm
import os
import os.path
import warnings
import pickle as pkl
import torch
import torch.nn
from sklearn.exceptions import ConvergenceWarning


class LinearFinetune(BaseEstimator):
    def __init__(
        self,
        checkpoint: str = "bert-base-uncased",
        layer: str = "last_hidden_state",
        random_state=None,
        normalize_embs=False,
        cache_embs_dir: str = None,
        verbose: int = 0,
        device="cuda" if torch.cuda.is_available() else "cpu"
    ):
        """LinearFinetune Class - use either LinearFinetuneClassifier or LinearFinetuneRegressor rather than initializing this class directly.

        Parameters
        ----------
        checkpoint: str
            Name of model checkpoint (i.e., to be fetched from Hugging Face)
        layer: str
            Name of layer to extract embeddings from
        random_state
            random seed for fitting
        normalize_embs
            whether to normalize embeddings before fitting linear model
        cache_embs_dir, optional
            if not None, directory to save embeddings into

        Example
        -------
        ```
        from imodelsx import LinearFinetuneClassifier
        import datasets
        import numpy as np

        # load data
        dset = datasets.load_dataset('rotten_tomatoes')['train']
        dset = dset.select(np.random.choice(len(dset), size=300, replace=False))
        dset_val = datasets.load_dataset('rotten_tomatoes')['validation']
        dset_val = dset_val.select(np.random.choice(len(dset_val), size=300, replace=False))


        # fit a simple one-layer finetune
        m = LinearFinetuneClassifier(
            checkpoint='distilbert-base-uncased',
        )
        m.fit(dset['text'], dset['label'])
        preds = m.predict(dset_val['text'])
        acc = (preds == dset_val['label']).mean()
        print('validation acc', acc)
        ```
        """
        self.checkpoint = checkpoint
        self.layer = layer
        self.random_state = random_state
        self.normalize_embs = normalize_embs
        self.cache_embs_dir = cache_embs_dir
        self.verbose = verbose
        self.device = device
        self._initialize_checkpoint_and_tokenizer()

    def _initialize_checkpoint_and_tokenizer(self):
        self.model = transformers.AutoModel.from_pretrained(
            self.checkpoint).to(self.device)
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            self.checkpoint)

    def fit(
        self,
        X_text: ArrayLike,
        y: ArrayLike,
    ):
        """Extract embeddings then fit linear model

        Parameters
        ----------
        X_text: ArrayLike[str]
        y: ArrayLike[str]
        """

        # metadata
        if isinstance(self, ClassifierMixin):
            self.classes_ = unique_labels(y)
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # set up model
        if self.verbose:
            print("initializing model...")

        # get embs
        if self.verbose:
            print("calculating embeddings...")
        if self.cache_embs_dir is not None and os.path.exists(
            os.path.join(self.cache_embs_dir, "embs.pkl")
        ):
            embs = pkl.load(
                open(os.path.join(self.cache_embs_dir, "embs.pkl"), "rb"))
        else:
            embs = self._get_embs(X_text)
            if self.cache_embs_dir is not None:
                os.makedirs(self.cache_embs_dir, exist_ok=True)
                pkl.dump(
                    embs, open(os.path.join(
                        self.cache_embs_dir, "embs.pkl"), "wb")
                )
        if self.normalize_embs:
            self.normalizer = StandardScaler()
            embs = self.normalizer.fit_transform(embs)

        # train linear
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        if self.verbose:
            print("training linear model...")
        if isinstance(self, ClassifierMixin):
            self.linear = LogisticRegressionCV()
        elif isinstance(self, RegressorMixin):
            self.linear = RidgeCV()
        self.linear.fit(embs, y)

        return self

    def _get_embs(self, X_text: ArrayLike):
        embs = []
        if isinstance(X_text, list):
            n = len(X_text)
        else:
            n = X_text.shape[0]
        for i in tqdm(range(n)):
            inputs = self.tokenizer(
                [X_text[i]], padding="max_length", truncation=True, return_tensors="pt"
            )
            inputs = inputs.to(self.model.device)
            output = self.model(**inputs)
            emb = output[self.layer].cpu().detach().numpy()
            if len(emb.shape) == 3:  # includes seq_len
                emb = emb.mean(axis=1)
            embs.append(emb)
        return np.array(embs).squeeze()  # num_examples x embedding_size

    def predict(self, X_text):
        """For regression returns continuous output.
        For classification, returns discrete output.
        """
        check_is_fitted(self)
        embs = self._get_embs(X_text)
        if self.normalize_embs:
            embs = self.normalizer.transform(embs)
        return self.linear.predict(embs)

    def predict_proba(self, X_text):
        check_is_fitted(self)
        embs = self._get_embs(X_text)
        if self.normalize_embs:
            embs = self.normalizer.transform(embs)
        return self.linear.predict_proba(embs)

    def _export_to_pytorch(self):
        assert self.normalize_embs == False, "not implemented"
        weights = self.linear.coef_
        intercept = self.linear.intercept_
        torch_model = LinearModelPytorch(
            in_features=weights.shape[1],
            out_classes=weights.shape[0],
        )
        torch_model.linear.weight = torch.nn.Parameter(
            torch.tensor(weights, dtype=torch.float32))
        torch_model.linear.bias = torch.nn.Parameter(
            torch.tensor(intercept, dtype=torch.float32))
        return torch_model


class LinearFinetuneRegressor(LinearFinetune, RegressorMixin):
    ...


class LinearFinetuneClassifier(LinearFinetune, ClassifierMixin):
    ...


class LinearModelPytorch(torch.nn.Module):
    def __init__(self, in_features, out_classes):
        super(LinearModelPytorch, self).__init__()
        self.linear = torch.nn.Linear(in_features, out_classes)

    def forward(self, x):
        return self.linear(x)


def sigmoid(z):
    """Apply the sigmoid function."""
    return 1 / (1 + np.exp(-z))


if __name__ == "__main__":
    import imodelsx.data

    dset, k = imodelsx.data.load_huggingface_dataset(
        "rotten_tomatoes", subsample_frac=0.01
    )
    text_test = dset["test"]["text"][:100]
    print(dset)
    print(dset["train"])
    print(np.unique(dset["train"]["label"]))

    clf = LinearFinetuneClassifier()
    clf.fit(dset["train"]["text"], dset["train"]["label"])

    print("predicting proba")
    preds_proba = clf.predict_proba(text_test)
    print(preds_proba.shape)

    print('predicting proba pytorch')
    clf_pytorch = clf._export_to_pytorch()
    preds_pytorch = clf_pytorch(torch.tensor(clf._get_embs(text_test)))
    preds_proba_pytorch = sigmoid(preds_pytorch.detach().numpy())
    assert np.allclose(preds_proba[:, 1].flatten(
    ), preds_proba_pytorch.flatten(), atol=1e-3)

    print("predicting")
    preds = clf.predict(text_test)

    assert preds_proba.shape[0] == preds.shape[0]
    print(
        "acc_train",
        np.mean(clf.predict(dset["train"]["text"]) == dset["train"]["label"]),
    )
    print("acc_test", np.mean(preds == dset["test"]["label"]))

Functions

def sigmoid(z)

Apply the sigmoid function.

Source code
def sigmoid(z):
    """Apply the sigmoid function."""
    return 1 / (1 + np.exp(-z))
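
In the __main__ demo at the bottom of the module source, sigmoid converts the exported PyTorch model's raw outputs into class-1 probabilities. A quick illustration with made-up decision values:

z = np.array([-2.0, 0.0, 3.0])   # hypothetical decision values
sigmoid(z)                        # approximately [0.119, 0.5, 0.953]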

Classes

class LinearFinetune (checkpoint: str = 'bert-base-uncased', layer: str = 'last_hidden_state', random_state=None, normalize_embs=False, cache_embs_dir: str = None, verbose: int = 0, device='cuda')

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

  • setting and getting parameters used by GridSearchCV and friends;
  • textual and HTML representation displayed in terminals and IDEs;
  • estimator serialization;
  • parameters validation;
  • data validation;
  • feature names validation.

Read more in the scikit-learn User Guide (rolling_your_own_estimator).

Notes

All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).

Examples

>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])
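
Because LinearFinetune follows this BaseEstimator convention, its constructor arguments are all inspectable and settable; a sketch (the device value shown is an assumption and depends on whether CUDA is available):

m = LinearFinetuneClassifier(checkpoint='distilbert-base-uncased')
m.get_params()
# {'cache_embs_dir': None, 'checkpoint': 'distilbert-base-uncased', 'device': 'cpu',
#  'layer': 'last_hidden_state', 'normalize_embs': False, 'random_state': None, 'verbose': 0}
m.set_params(normalize_embs=True)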

LinearFinetune Class - use either LinearFinetuneClassifier or LinearFinetuneRegressor rather than initializing this class directly.

Parameters

checkpoint : str
Name of model checkpoint (i.e., to be fetched from Hugging Face)
layer : str
Name of layer to extract embeddings from
random_state
random seed for fitting
normalize_embs
whether to normalize embeddings before fitting linear model
cache_embs_dir, optional
if not None, directory to save embeddings into

Example

from imodelsx import LinearFinetuneClassifier
import datasets
import numpy as np

# load data
dset = datasets.load_dataset('rotten_tomatoes')['train']
dset = dset.select(np.random.choice(len(dset), size=300, replace=False))
dset_val = datasets.load_dataset('rotten_tomatoes')['validation']
dset_val = dset_val.select(np.random.choice(len(dset_val), size=300, replace=False))


# fit a simple one-layer finetune
m = LinearFinetuneClassifier(
    checkpoint='distilbert-base-uncased',
)
m.fit(dset['text'], dset['label'])
preds = m.predict(dset_val['text'])
acc = (preds == dset_val['label']).mean()
print('validation acc', acc)
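
The optional arguments compose with the example above; a sketch, assuming a writable local directory (the cache path below is hypothetical):

m = LinearFinetuneClassifier(
    checkpoint='distilbert-base-uncased',
    normalize_embs=True,           # fit a StandardScaler on the training embeddings
    cache_embs_dir='embs_cache',   # hypothetical path; training embeddings are pickled to embs.pkl and reused if present
)
m.fit(dset['text'], dset['label'])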
Source code
class LinearFinetune(BaseEstimator):
    def __init__(
        self,
        checkpoint: str = "bert-base-uncased",
        layer: str = "last_hidden_state",
        random_state=None,
        normalize_embs=False,
        cache_embs_dir: str = None,
        verbose: int = 0,
        device="cuda" if torch.cuda.is_available() else "cpu"
    ):
        """LinearFinetune Class - use either LinearFinetuneClassifier or LinearFinetuneRegressor rather than initializing this class directly.

        Parameters
        ----------
        checkpoint: str
            Name of model checkpoint (i.e., to be fetched from Hugging Face)
        layer: str
            Name of layer to extract embeddings from
        random_state
            random seed for fitting
        normalize_embs
            whether to normalize embeddings before fitting linear model
        cache_embs_dir, optional
            if not None, directory to save embeddings into

        Example
        -------
        ```
        from imodelsx import LinearFinetuneClassifier
        import datasets
        import numpy as np

        # load data
        dset = datasets.load_dataset('rotten_tomatoes')['train']
        dset = dset.select(np.random.choice(len(dset), size=300, replace=False))
        dset_val = datasets.load_dataset('rotten_tomatoes')['validation']
        dset_val = dset_val.select(np.random.choice(len(dset_val), size=300, replace=False))


        # fit a simple one-layer finetune
        m = LinearFinetuneClassifier(
            checkpoint='distilbert-base-uncased',
        )
        m.fit(dset['text'], dset['label'])
        preds = m.predict(dset_val['text'])
        acc = (preds == dset_val['label']).mean()
        print('validation acc', acc)
        ```
        """
        self.checkpoint = checkpoint
        self.layer = layer
        self.random_state = random_state
        self.normalize_embs = normalize_embs
        self.cache_embs_dir = cache_embs_dir
        self.verbose = verbose
        self.device = device
        self._initialize_checkpoint_and_tokenizer()

    def _initialize_checkpoint_and_tokenizer(self):
        self.model = transformers.AutoModel.from_pretrained(
            self.checkpoint).to(self.device)
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            self.checkpoint)

    def fit(
        self,
        X_text: ArrayLike,
        y: ArrayLike,
    ):
        """Extract embeddings then fit linear model

        Parameters
        ----------
        X_text: ArrayLike[str]
        y: ArrayLike[str]
        """

        # metadata
        if isinstance(self, ClassifierMixin):
            self.classes_ = unique_labels(y)
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # set up model
        if self.verbose:
            print("initializing model...")

        # get embs
        if self.verbose:
            print("calculating embeddings...")
        if self.cache_embs_dir is not None and os.path.exists(
            os.path.join(self.cache_embs_dir, "embs.pkl")
        ):
            embs = pkl.load(
                open(os.path.join(self.cache_embs_dir, "embs.pkl"), "rb"))
        else:
            embs = self._get_embs(X_text)
            if self.cache_embs_dir is not None:
                os.makedirs(self.cache_embs_dir, exist_ok=True)
                pkl.dump(
                    embs, open(os.path.join(
                        self.cache_embs_dir, "embs.pkl"), "wb")
                )
        if self.normalize_embs:
            self.normalizer = StandardScaler()
            embs = self.normalizer.fit_transform(embs)

        # train linear
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        if self.verbose:
            print("training linear model...")
        if isinstance(self, ClassifierMixin):
            self.linear = LogisticRegressionCV()
        elif isinstance(self, RegressorMixin):
            self.linear = RidgeCV()
        self.linear.fit(embs, y)

        return self

    def _get_embs(self, X_text: ArrayLike):
        embs = []
        if isinstance(X_text, list):
            n = len(X_text)
        else:
            n = X_text.shape[0]
        for i in tqdm(range(n)):
            inputs = self.tokenizer(
                [X_text[i]], padding="max_length", truncation=True, return_tensors="pt"
            )
            inputs = inputs.to(self.model.device)
            output = self.model(**inputs)
            emb = output[self.layer].cpu().detach().numpy()
            if len(emb.shape) == 3:  # includes seq_len
                emb = emb.mean(axis=1)
            embs.append(emb)
        return np.array(embs).squeeze()  # num_examples x embedding_size

    def predict(self, X_text):
        """For regression returns continuous output.
        For classification, returns discrete output.
        """
        check_is_fitted(self)
        embs = self._get_embs(X_text)
        if self.normalize_embs:
            embs = self.normalizer.transform(embs)
        return self.linear.predict(embs)

    def predict_proba(self, X_text):
        check_is_fitted(self)
        embs = self._get_embs(X_text)
        if self.normalize_embs:
            embs = self.normalizer.transform(embs)
        return self.linear.predict_proba(embs)

    def _export_to_pytorch(self):
        assert self.normalize_embs == False, "not implemented"
        weights = self.linear.coef_
        intercept = self.linear.intercept_
        torch_model = LinearModelPytorch(
            in_features=weights.shape[1],
            out_classes=weights.shape[0],
        )
        torch_model.linear.weight = torch.nn.Parameter(
            torch.tensor(weights, dtype=torch.float32))
        torch_model.linear.bias = torch.nn.Parameter(
            torch.tensor(intercept, dtype=torch.float32))
        return torch_model

Ancestors

  • sklearn.base.BaseEstimator
  • sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester

Subclasses

  • LinearFinetuneClassifier
  • LinearFinetuneRegressor

Methods

def fit(self, X_text: ArrayLike, y: ArrayLike)

Extract embeddings then fit linear model

Parameters

X_text : ArrayLike[str]
 
y : ArrayLike[str]
 
Source code
def fit(
    self,
    X_text: ArrayLike,
    y: ArrayLike,
):
    """Extract embeddings then fit linear model

    Parameters
    ----------
    X_text: ArrayLike[str]
    y: ArrayLike[str]
    """

    # metadata
    if isinstance(self, ClassifierMixin):
        self.classes_ = unique_labels(y)
    if self.random_state is not None:
        np.random.seed(self.random_state)

    # set up model
    if self.verbose:
        print("initializing model...")

    # get embs
    if self.verbose:
        print("calculating embeddings...")
    if self.cache_embs_dir is not None and os.path.exists(
        os.path.join(self.cache_embs_dir, "embs.pkl")
    ):
        embs = pkl.load(
            open(os.path.join(self.cache_embs_dir, "embs.pkl"), "rb"))
    else:
        embs = self._get_embs(X_text)
        if self.cache_embs_dir is not None:
            os.makedirs(self.cache_embs_dir, exist_ok=True)
            pkl.dump(
                embs, open(os.path.join(
                    self.cache_embs_dir, "embs.pkl"), "wb")
            )
    if self.normalize_embs:
        self.normalizer = StandardScaler()
        embs = self.normalizer.fit_transform(embs)

    # train linear
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    if self.verbose:
        print("training linear model...")
    if isinstance(self, ClassifierMixin):
        self.linear = LogisticRegressionCV()
    elif isinstance(self, RegressorMixin):
        self.linear = RidgeCV()
    self.linear.fit(embs, y)

    return self
def predict(self, X_text)

For regression returns continuous output. For classification, returns discrete output.

Source code
def predict(self, X_text):
    """For regression returns continuous output.
    For classification, returns discrete output.
    """
    check_is_fitted(self)
    embs = self._get_embs(X_text)
    if self.normalize_embs:
        embs = self.normalizer.transform(embs)
    return self.linear.predict(embs)
def predict_proba(self, X_text)
Source code
def predict_proba(self, X_text):
    check_is_fitted(self)
    embs = self._get_embs(X_text)
    if self.normalize_embs:
        embs = self.normalizer.transform(embs)
    return self.linear.predict_proba(embs)
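
Continuing the classifier example above, predict returns hard labels while predict_proba returns one column per class (two for rotten_tomatoes), so for the binary case the positive-class score is the second column:

preds = m.predict(dset_val['text'])        # shape (n,), entries drawn from m.classes_
probs = m.predict_proba(dset_val['text'])  # shape (n, 2), rows sum to 1
scores = probs[:, 1]                       # probability of the positive class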
def set_fit_request(self: LinearFinetune, *, X_text: Union[bool, None, str] = '$UNCHANGED$') ‑> LinearFinetune

Request metadata passed to the fit method.

Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). Please see the User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

  • True: metadata is requested, and passed to fit if provided. The request is ignored if metadata is not provided.

  • False: metadata is not requested and the meta-estimator will not pass it to fit.

  • None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

  • str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in version: 1.3

Note

This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. used inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.

Parameters

X_text : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
Metadata routing for X_text parameter in fit.

Returns

self : object
The updated object.
Source code
def func(**kw):
    """Updates the request for provided parameters

    This docstring is overwritten below.
    See REQUESTER_DOC for expected functionality
    """
    if not _routing_enabled():
        raise RuntimeError(
            "This method is only available when metadata routing is enabled."
            " You can enable it using"
            " sklearn.set_config(enable_metadata_routing=True)."
        )

    if self.validate_keys and (set(kw) - set(self.keys)):
        raise TypeError(
            f"Unexpected args: {set(kw) - set(self.keys)}. Accepted arguments"
            f" are: {set(self.keys)}"
        )

    requests = instance._get_metadata_request()
    method_metadata_request = getattr(requests, self.name)

    for prop, alias in kw.items():
        if alias is not UNCHANGED:
            method_metadata_request.add_request(param=prop, alias=alias)
    instance._metadata_request = requests

    return instance
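
A minimal sketch of the routing options above, assuming scikit-learn >= 1.3 (routing must be enabled globally before any set_*_request call):

import sklearn
from imodelsx import LinearFinetuneClassifier

sklearn.set_config(enable_metadata_routing=True)

# ask meta-estimators to forward the X_text argument to fit;
# False would suppress it, and a string would pass it under an alias
clf = LinearFinetuneClassifier().set_fit_request(X_text=True)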
def set_predict_proba_request(self: LinearFinetune, *, X_text: Union[bool, None, str] = '$UNCHANGED$') ‑> LinearFinetune

Request metadata passed to the predict_proba method.

Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). Please see the User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

  • True: metadata is requested, and passed to predict_proba if provided. The request is ignored if metadata is not provided.

  • False: metadata is not requested and the meta-estimator will not pass it to predict_proba.

  • None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

  • str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in version: 1.3

Note

This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. used inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.

Parameters

X_text : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
Metadata routing for X_text parameter in predict_proba.

Returns

self : object
The updated object.
def set_predict_request(self: LinearFinetune, *, X_text: Union[bool, None, str] = '$UNCHANGED$') ‑> LinearFinetune

Request metadata passed to the predict method.

Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). Please see the User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

  • True: metadata is requested, and passed to predict if provided. The request is ignored if metadata is not provided.

  • False: metadata is not requested and the meta-estimator will not pass it to predict.

  • None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

  • str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in version: 1.3

Note

This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. used inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.

Parameters

X_text : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
Metadata routing for X_text parameter in predict.

Returns

self : object
The updated object.
class LinearFinetuneClassifier (checkpoint: str = 'bert-base-uncased', layer: str = 'last_hidden_state', random_state=None, normalize_embs=False, cache_embs_dir: str = None, verbose: int = 0, device='cuda')

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

  • setting and getting parameters used by GridSearchCV and friends;
  • textual and HTML representation displayed in terminals and IDEs;
  • estimator serialization;
  • parameters validation;
  • data validation;
  • feature names validation.

Read more in the scikit-learn User Guide (rolling_your_own_estimator).

Notes

All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).

Examples

>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])

LinearFinetune Class - use either LinearFinetuneClassifier or LinearFinetuneRegressor rather than initializing this class directly.

Parameters

checkpoint : str
Name of model checkpoint (i.e., to be fetched from Hugging Face)
layer : str
Name of layer to extract embeddings from
random_state
random seed for fitting
normalize_embs
whether to normalize embeddings before fitting linear model
cache_embs_dir, optional
if not None, directory to save embeddings into

Example

from imodelsx import LinearFinetuneClassifier
import datasets
import numpy as np

# load data
dset = datasets.load_dataset('rotten_tomatoes')['train']
dset = dset.select(np.random.choice(len(dset), size=300, replace=False))
dset_val = datasets.load_dataset('rotten_tomatoes')['validation']
dset_val = dset_val.select(np.random.choice(len(dset_val), size=300, replace=False))


# fit a simple one-layer finetune
m = LinearFinetuneClassifier(
    checkpoint='distilbert-base-uncased',
)
m.fit(dset['text'], dset['label'])
preds = m.predict(dset_val['text'])
acc = (preds == dset_val['label']).mean()
print('validation acc', acc)
Source code
class LinearFinetuneClassifier(LinearFinetune, ClassifierMixin):
    ...

Ancestors

  • LinearFinetune
  • sklearn.base.BaseEstimator
  • sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester
  • sklearn.base.ClassifierMixin

Methods

def set_score_request(self: LinearFinetuneClassifier, *, sample_weight: Union[bool, None, str] = '$UNCHANGED$') ‑> LinearFinetuneClassifier

Request metadata passed to the score method.

Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). Please see the User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

  • True: metadata is requested, and passed to score if provided. The request is ignored if metadata is not provided.

  • False: metadata is not requested and the meta-estimator will not pass it to score.

  • None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

  • str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in version: 1.3

Note

This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. used inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.

Parameters

sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
Metadata routing for sample_weight parameter in score.

Returns

self : object
The updated object.

Inherited members

  • LinearFinetune: fit, predict, predict_proba

class LinearFinetuneRegressor (checkpoint: str = 'bert-base-uncased', layer: str = 'last_hidden_state', random_state=None, normalize_embs=False, cache_embs_dir: str = None, verbose: int = 0, device='cuda')

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

  • setting and getting parameters used by GridSearchCV and friends;
  • textual and HTML representation displayed in terminals and IDEs;
  • estimator serialization;
  • parameters validation;
  • data validation;
  • feature names validation.

Read more in the scikit-learn User Guide (rolling_your_own_estimator).

Notes

All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).

Examples

>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])

LinearFinetune Class - use either LinearFinetuneClassifier or LinearFinetuneRegressor rather than initializing this class directly.

Parameters

checkpoint : str
Name of model checkpoint (i.e., to be fetched from Hugging Face)
layer : str
Name of layer to extract embeddings from
random_state
random seed for fitting
normalize_embs
whether to normalize embeddings before fitting linear model
cache_embs_dir, optional
if not None, directory to save embeddings into

Example

from imodelsx import LinearFinetuneClassifier
import datasets
import numpy as np

# load data
dset = datasets.load_dataset('rotten_tomatoes')['train']
dset = dset.select(np.random.choice(len(dset), size=300, replace=False))
dset_val = datasets.load_dataset('rotten_tomatoes')['validation']
dset_val = dset_val.select(np.random.choice(len(dset_val), size=300, replace=False))


# fit a simple one-layer finetune
m = LinearFinetuneClassifier(
    checkpoint='distilbert-base-uncased',
)
m.fit(dset['text'], dset['label'])
preds = m.predict(dset_val['text'])
acc = (preds == dset_val['label']).mean()
print('validation acc', acc)
Source code
class LinearFinetuneRegressor(LinearFinetune, RegressorMixin):
    ...

Ancestors

  • LinearFinetune
  • sklearn.base.BaseEstimator
  • sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester
  • sklearn.base.RegressorMixin

Methods

def set_score_request(self: LinearFinetuneRegressor, *, sample_weight: Union[bool, None, str] = '$UNCHANGED$') ‑> LinearFinetuneRegressor

Request metadata passed to the score method.

Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). Please see the User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

  • True: metadata is requested, and passed to score if provided. The request is ignored if metadata is not provided.

  • False: metadata is not requested and the meta-estimator will not pass it to score.

  • None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

  • str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in version: 1.3

Note

This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. used inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.

Parameters

sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
Metadata routing for sample_weight parameter in score.

Returns

self : object
The updated object.

Inherited members

  • LinearFinetune: fit, predict, predict_proba

class LinearModelPytorch (in_features, out_classes)

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing them to be nested in a tree structure. You can assign the submodules as regular attributes:

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call to(), etc.

Note

As per the example above, an __init__() call to the parent class must be made before assignment on the child.

training : bool
Whether this module is in training or evaluation mode.

Initialize internal Module state, shared by both nn.Module and ScriptModule.

Source code
class LinearModelPytorch(torch.nn.Module):
    def __init__(self, in_features, out_classes):
        super(LinearModelPytorch, self).__init__()
        self.linear = torch.nn.Linear(in_features, out_classes)

    def forward(self, x):
        return self.linear(x)

Ancestors

  • torch.nn.modules.module.Module

Methods

def forward(self, x)

Define the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the :class:Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

Source code
def forward(self, x):
    return self.linear(x)
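
A sketch of how this class is used in the __main__ demo above: _export_to_pytorch copies the fitted linear coefficients into a LinearModelPytorch, and (for a binary classifier) applying sigmoid to its outputs reproduces predict_proba's positive-class column (clf and text_test are the names from the demo):

import torch

torch_model = clf._export_to_pytorch()          # copies coef_ and intercept_ into a torch.nn.Linear
embs = torch.tensor(clf._get_embs(text_test))   # embeddings as a float32 tensor
logits = torch_model(embs)                      # raw decision values, shape (n, 1) for binary labels
probs_pos = sigmoid(logits.detach().numpy())    # close to clf.predict_proba(text_test)[:, 1]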