Module imodelsx.linear_finetune
Simple scikit-learn interface for finetuning a single linear layer on top of LLM embeddings.
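For example, here is a minimal usage sketch for the regression variant (the texts and continuous targets below are made up for illustration, and it assumes LinearFinetuneRegressor is exported from the package top level like LinearFinetuneClassifier):
```
from imodelsx import LinearFinetuneRegressor
import numpy as np

# illustrative texts and made-up continuous targets
texts = [
    "a gripping, beautifully shot film",
    "dull and far too long",
    "an instant classic",
    "forgettable and poorly acted",
]
scores = np.array([0.9, 0.2, 1.0, 0.1])

m = LinearFinetuneRegressor(
    checkpoint='distilbert-base-uncased',
    normalize_embs=True,  # standardize embeddings before fitting RidgeCV
)
m.fit(texts, scores)
print(m.predict(["surprisingly moving", "a tedious mess"]))
```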
Source code
"""
Simple scikit-learn interface for finetuning a single linear layer on top of LLM embeddings.
"""
from numpy.typing import ArrayLike
import numpy as np
from scipy.special import softmax
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.linear_model import LogisticRegressionCV, RidgeCV
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_is_fitted
# from spacy.lang.en import English
from scipy.sparse import issparse
from sklearn.preprocessing import StandardScaler
import transformers
from tqdm import tqdm
import os
import os.path
import warnings
import pickle as pkl
import torch
import torch.nn
from sklearn.exceptions import ConvergenceWarning
class LinearFinetune(BaseEstimator):
def __init__(
self,
checkpoint: str = "bert-base-uncased",
layer: str = "last_hidden_state",
random_state=None,
normalize_embs=False,
cache_embs_dir: str = None,
verbose: int = 0,
device="cuda" if torch.cuda.is_available() else "cpu"
):
"""LinearFinetune Class - use either LinearFinetuneClassifier or LinearFinetuneRegressor rather than initializing this class directly.
Parameters
----------
checkpoint: str
Name of the model checkpoint (i.e., to be fetched from the HuggingFace Hub)
layer: str
Name of layer to extract embeddings from
random_state
random seed for fitting
normalize_embs
whether to normalize embeddings before fitting linear model
cache_embs_dir, optional
if not None, directory to save embeddings into
Example
-------
```
from imodelsx import LinearFinetuneClassifier
import datasets
import numpy as np
# load data
dset = datasets.load_dataset('rotten_tomatoes')['train']
dset = dset.select(np.random.choice(len(dset), size=300, replace=False))
dset_val = datasets.load_dataset('rotten_tomatoes')['validation']
dset_val = dset_val.select(np.random.choice(len(dset_val), size=300, replace=False))
# fit a simple one-layer finetune
m = LinearFinetuneClassifier(
checkpoint='distilbert-base-uncased',
)
m.fit(dset['text'], dset['label'])
preds = m.predict(dset_val['text'])
acc = (preds == dset_val['label']).mean()
print('validation acc', acc)
```
"""
self.checkpoint = checkpoint
self.layer = layer
self.random_state = random_state
self.normalize_embs = normalize_embs
self.cache_embs_dir = cache_embs_dir
self.verbose = verbose
self.device = device
self._initialize_checkpoint_and_tokenizer()
def _initialize_checkpoint_and_tokenizer(self):
self.model = transformers.AutoModel.from_pretrained(
self.checkpoint).to(self.device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
self.checkpoint)
def fit(
self,
X_text: ArrayLike,
y: ArrayLike,
):
"""Extract embeddings then fit linear model
Parameters
----------
X_text: ArrayLike[str]
y: ArrayLike[str]
"""
# metadata
if isinstance(self, ClassifierMixin):
self.classes_ = unique_labels(y)
if self.random_state is not None:
np.random.seed(self.random_state)
# set up model
if self.verbose:
print("initializing model...")
# get embs
if self.verbose:
print("calculating embeddings...")
if self.cache_embs_dir is not None and os.path.exists(
os.path.join(self.cache_embs_dir, "embs.pkl")
):
embs = pkl.load(
open(os.path.join(self.cache_embs_dir, "embs.pkl"), "rb"))
else:
embs = self._get_embs(X_text)
if self.cache_embs_dir is not None:
os.makedirs(self.cache_embs_dir, exist_ok=True)
pkl.dump(
embs, open(os.path.join(
self.cache_embs_dir, "embs.pkl"), "wb")
)
if self.normalize_embs:
self.normalizer = StandardScaler()
embs = self.normalizer.fit_transform(embs)
# train linear
warnings.filterwarnings("ignore", category=ConvergenceWarning)
if self.verbose:
print("training linear model...")
if isinstance(self, ClassifierMixin):
self.linear = LogisticRegressionCV()
elif isinstance(self, RegressorMixin):
self.linear = RidgeCV()
self.linear.fit(embs, y)
return self
def _get_embs(self, X_text: ArrayLike):
embs = []
if isinstance(X_text, list):
n = len(X_text)
else:
n = X_text.shape[0]
for i in tqdm(range(n)):
inputs = self.tokenizer(
[X_text[i]], padding="max_length", truncation=True, return_tensors="pt"
)
inputs = inputs.to(self.model.device)
output = self.model(**inputs)
emb = output[self.layer].cpu().detach().numpy()
if len(emb.shape) == 3: # includes seq_len
emb = emb.mean(axis=1)
embs.append(emb)
return np.array(embs).squeeze() # num_examples x embedding_size
def predict(self, X_text):
"""For regression returns continuous output.
For classification, returns discrete output.
"""
check_is_fitted(self)
embs = self._get_embs(X_text)
if self.normalize_embs:
embs = self.normalizer.transform(embs)
return self.linear.predict(embs)
def predict_proba(self, X_text):
check_is_fitted(self)
embs = self._get_embs(X_text)
if self.normalize_embs:
embs = self.normalizer.transform(embs)
return self.linear.predict_proba(embs)
def _export_to_pytorch(self):
assert self.normalize_embs == False, "not implemented"
weights = self.linear.coef_
intercept = self.linear.intercept_
torch_model = LinearModelPytorch(
in_features=weights.shape[1],
out_classes=weights.shape[0],
)
torch_model.linear.weight = torch.nn.Parameter(
torch.tensor(weights, dtype=torch.float32))
torch_model.linear.bias = torch.nn.Parameter(
torch.tensor(intercept, dtype=torch.float32))
return torch_model
class LinearFinetuneRegressor(LinearFinetune, RegressorMixin):
...
class LinearFinetuneClassifier(LinearFinetune, ClassifierMixin):
...
class LinearModelPytorch(torch.nn.Module):
def __init__(self, in_features, out_classes):
super(LinearModelPytorch, self).__init__()
self.linear = torch.nn.Linear(in_features, out_classes)
def forward(self, x):
return self.linear(x)
def sigmoid(z):
"""Apply the sigmoid function."""
return 1 / (1 + np.exp(-z))
if __name__ == "__main__":
import imodelsx.data
dset, k = imodelsx.data.load_huggingface_dataset(
"rotten_tomatoes", subsample_frac=0.01
)
text_test = dset["test"]["text"][:100]
print(dset)
print(dset["train"])
print(np.unique(dset["train"]["label"]))
clf = LinearFinetuneClassifier()
clf.fit(dset["train"]["text"], dset["train"]["label"])
print("predicting proba")
preds_proba = clf.predict_proba(text_test)
print(preds_proba.shape)
print('predicting proba pytorch')
clf_pytorch = clf._export_to_pytorch()
preds_pytorch = clf_pytorch(torch.tensor(clf._get_embs(text_test)))
preds_proba_pytorch = sigmoid(preds_pytorch.detach().numpy())
assert np.allclose(preds_proba[:, 1].flatten(
), preds_proba_pytorch.flatten(), atol=1e-3)
print("predicting")
preds = clf.predict(text_test)
assert preds_proba.shape[0] == preds.shape[0]
print(
"acc_train",
np.mean(clf.predict(dset["train"]["text"]) == dset["train"]["label"]),
)
print("acc_test", np.mean(preds == dset["test"]["label"]))
Functions
def sigmoid(z)
-
Apply the sigmoid function.
Classes
class LinearFinetune (checkpoint: str = 'bert-base-uncased', layer: str = 'last_hidden_state', random_state=None, normalize_embs=False, cache_embs_dir: str = None, verbose: int = 0, device='cuda')
-
Base class for all estimators in scikit-learn.
Inheriting from this class provides default implementations of:
- setting and getting parameters used by GridSearchCV and friends;
- textual and HTML representation displayed in terminals and IDEs;
- estimator serialization;
- parameters validation;
- data validation;
- feature names validation.
Read more in the scikit-learn User Guide (rolling your own estimator).
Notes
All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).
Examples
>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])
LinearFinetune Class - use either LinearFinetuneClassifier or LinearFinetuneRegressor rather than initializing this class directly.
Parameters
checkpoint : str
- Name of the model checkpoint (i.e., to be fetched from the HuggingFace Hub)
layer : str
- Name of the layer to extract embeddings from
random_state
- random seed for fitting
normalize_embs
- whether to normalize embeddings before fitting the linear model
cache_embs_dir : str, optional
- if not None, directory to save embeddings into
Example
from imodelsx import LinearFinetuneClassifier
import datasets
import numpy as np

# load data
dset = datasets.load_dataset('rotten_tomatoes')['train']
dset = dset.select(np.random.choice(len(dset), size=300, replace=False))
dset_val = datasets.load_dataset('rotten_tomatoes')['validation']
dset_val = dset_val.select(np.random.choice(len(dset_val), size=300, replace=False))

# fit a simple one-layer finetune
m = LinearFinetuneClassifier(
    checkpoint='distilbert-base-uncased',
)
m.fit(dset['text'], dset['label'])
preds = m.predict(dset_val['text'])
acc = (preds == dset_val['label']).mean()
print('validation acc', acc)
Ancestors
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
Subclasses
- LinearFinetuneClassifier
- LinearFinetuneRegressor
Methods
def fit(self, X_text: ArrayLike, y: ArrayLike)
-
Extract embeddings then fit linear model
Parameters
X_text : ArrayLike[str]
y : ArrayLike[str]
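As a sketch of the embedding-cache path in fit (the directory name and texts are illustrative): the first fit computes embeddings and pickles them to embs.pkl inside cache_embs_dir; any later fit pointed at the same directory loads that file instead of re-running the model. Note the cache is keyed only by the directory, not by the input texts.
```
from imodelsx import LinearFinetuneClassifier

texts = ['great', 'superb', 'fun', 'lovely', 'strong', 'smart',
         'awful', 'boring', 'weak', 'messy', 'dull', 'flat']
labels = [1] * 6 + [0] * 6

# first fit: embeddings are computed and written to cache_embs/embs.pkl
clf1 = LinearFinetuneClassifier(cache_embs_dir='cache_embs')
clf1.fit(texts, labels)

# second fit with the same directory: embs.pkl is loaded, no forward passes
clf2 = LinearFinetuneClassifier(cache_embs_dir='cache_embs')
clf2.fit(texts, labels)
```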
def predict(self, X_text)
-
For regression returns continuous output. For classification, returns discrete output.
def predict_proba(self, X_text)
-
Return predicted class probabilities (classification only).
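For instance, a short sketch of the classification prediction path (the texts and labels are illustrative): predict_proba returns one row of class probabilities per input text, and predict returns the label from classes_ with the highest probability.
```
import numpy as np
from imodelsx import LinearFinetuneClassifier

clf = LinearFinetuneClassifier(checkpoint='distilbert-base-uncased')
clf.fit(['moving', 'charming', 'brilliant', 'witty', 'tender', 'thrilling',
         'tedious', 'clumsy', 'hollow', 'bland', 'grating', 'lifeless'],
        [1] * 6 + [0] * 6)

texts = ['an absolute delight', 'a waste of two hours']
proba = clf.predict_proba(texts)  # shape (2, 2); rows sum to 1
preds = clf.predict(texts)        # entries drawn from clf.classes_
assert np.allclose(proba.sum(axis=1), 1.0)
assert np.array_equal(preds, clf.classes_[proba.argmax(axis=1)])
```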
def set_fit_request(self: LinearFinetune, *, X_text: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> LinearFinetune
-
Request metadata passed to the fit method.
Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). Please see the scikit-learn User Guide on metadata routing for how the routing mechanism works.
The options for each parameter are:
- True: metadata is requested, and passed to fit if provided. The request is ignored if metadata is not provided.
- False: metadata is not requested and the meta-estimator will not pass it to fit.
- None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.
- str: metadata should be passed to the meta-estimator with this given alias instead of the original name.
The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.
Added in version: 1.3
Note
This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. used inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.
Parameters
X_text : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
- Metadata routing for the X_text parameter in fit.
Returns
self : object
- The updated object.
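Calling this method requires scikit-learn's metadata routing to be enabled globally first; a minimal sketch (assuming scikit-learn >= 1.3):
```
import sklearn
from imodelsx import LinearFinetuneClassifier

# set_fit_request raises a RuntimeError unless metadata routing is enabled
sklearn.set_config(enable_metadata_routing=True)

clf = LinearFinetuneClassifier()
clf = clf.set_fit_request(X_text=True)  # route X_text to fit when used in a meta-estimator
```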
def set_predict_proba_request(self: LinearFinetune, *, X_text: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> LinearFinetune
-
Request metadata passed to the predict_proba method.
The options and semantics are the same as for set_fit_request above, applied to the X_text parameter of predict_proba: it is only relevant when enable_metadata_routing=True (see sklearn.set_config) and this estimator is used as a sub-estimator of a meta-estimator such as a sklearn.pipeline.Pipeline.
Parameters
X_text : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
- Metadata routing for the X_text parameter in predict_proba.
Returns
self : object
- The updated object.
def set_predict_request(self: LinearFinetune, *, X_text: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> LinearFinetune
-
Request metadata passed to the predict method.
The options and semantics are the same as for set_fit_request above, applied to the X_text parameter of predict: it is only relevant when enable_metadata_routing=True (see sklearn.set_config) and this estimator is used as a sub-estimator of a meta-estimator such as a sklearn.pipeline.Pipeline.
Parameters
X_text : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
- Metadata routing for the X_text parameter in predict.
Returns
self : object
- The updated object.
class LinearFinetuneClassifier (checkpoint: str = 'bert-base-uncased', layer: str = 'last_hidden_state', random_state=None, normalize_embs=False, cache_embs_dir: str = None, verbose: int = 0, device='cuda')
-
Classification variant of LinearFinetune (adds sklearn's ClassifierMixin, so fit trains a LogisticRegressionCV on the extracted embeddings). See LinearFinetune above for parameters and a usage example.
Ancestors
- LinearFinetune
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
- sklearn.base.ClassifierMixin
Methods
def set_score_request(self: LinearFinetuneClassifier, *, sample_weight: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> LinearFinetuneClassifier
-
Request metadata passed to the score method.
The options and semantics are the same as for set_fit_request on LinearFinetune above, applied to the sample_weight parameter of score: it is only relevant when enable_metadata_routing=True (see sklearn.set_config) and this estimator is used as a sub-estimator of a meta-estimator such as a sklearn.pipeline.Pipeline.
Parameters
sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
- Metadata routing for the sample_weight parameter in score.
Returns
self : object
- The updated object.
Inherited members
- LinearFinetune: fit, predict, predict_proba
class LinearFinetuneRegressor (checkpoint: str = 'bert-base-uncased', layer: str = 'last_hidden_state', random_state=None, normalize_embs=False, cache_embs_dir: str = None, verbose: int = 0, device='cuda')
-
Regression variant of LinearFinetune (adds sklearn's RegressorMixin, so fit trains a RidgeCV on the extracted embeddings). See LinearFinetune above for parameters and a usage example.
Ancestors
- LinearFinetune
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
- sklearn.base.RegressorMixin
Methods
def set_score_request(self: LinearFinetuneRegressor, *, sample_weight: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> LinearFinetuneRegressor
-
Request metadata passed to the score method.
The options and semantics are the same as for set_fit_request on LinearFinetune above, applied to the sample_weight parameter of score: it is only relevant when enable_metadata_routing=True (see sklearn.set_config) and this estimator is used as a sub-estimator of a meta-estimator such as a sklearn.pipeline.Pipeline.
Parameters
sample_weight : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
- Metadata routing for the sample_weight parameter in score.
Returns
self : object
- The updated object.
Inherited members
- LinearFinetune: fit, predict
class LinearModelPytorch (in_features, out_classes)
-
Base class for all neural network modules.
Your models should also subclass this class.
Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes:
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))
Submodules assigned in this way will be registered, and will have their parameters converted too when you call to(), etc.
Note
As per the example above, an __init__() call to the parent class must be made before assignment on the child.
training : bool
- Boolean representing whether this module is in training or evaluation mode.
Initialize internal Module state, shared by both nn.Module and ScriptModule.
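A short sketch of how this module is used: LinearFinetune._export_to_pytorch copies the fitted linear coefficients into a LinearModelPytorch, whose forward pass maps embeddings to scores (the sizes and the random batch below are illustrative):
```
import torch

# binary classifier head: LogisticRegressionCV coef_ has shape (1, 768) for DistilBERT
model = LinearModelPytorch(in_features=768, out_classes=1)

embs = torch.randn(4, 768)      # illustrative batch of embeddings
logits = model(embs)            # shape (4, 1)
probs = torch.sigmoid(logits)   # matches the sigmoid() check in __main__ above
```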
Ancestors
- torch.nn.modules.module.Module
Methods
def forward(self, x)
-
Define the computation performed at every call.
Should be overridden by all subclasses.
Note
Although the recipe for the forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.