Module imodelsx.linear_ngram
Simple scikit-learn interface for fitting a linear model on top of ngram features (bag-of-words or TF-IDF).
Expand source code
"""
Simple scikit-learn interface for fitting a linear model on top of ngram features (bag-of-words or TF-IDF).
"""
from numpy.typing import ArrayLike
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.linear_model import LogisticRegressionCV, RidgeCV
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.exceptions import ConvergenceWarning
import warnings
class LinearNgram(BaseEstimator):
def __init__(
self,
checkpoint: str = "tfidfvectorizer",
tokenizer=None,
ngrams=2,
all_ngrams=True,
random_state=None,
):
"""LinearNgram Class - use either LinearNgramClassifier or LinearNgramRegressor rather than initializing this class directly.
Parameters
----------
checkpoint: str
Name of vectorizer checkpoint: "countvectorizer" or "tfidfvectorizer"
ngrams
Order of ngrams to extract. 1 for unigrams, 2 for bigrams, etc.
all_ngrams
Whether to use all order ngrams <= ngrams argument
random_state
random seed for fitting
Example
-------
```
from imodelsx import LinearNgramClassifier
import datasets
import numpy as np
# load data
dset = datasets.load_dataset('rotten_tomatoes')['train']
dset = dset.select(np.random.choice(len(dset), size=300, replace=False))
dset_val = datasets.load_dataset('rotten_tomatoes')['validation']
dset_val = dset_val.select(np.random.choice(len(dset_val), size=300, replace=False))
# fit a simple ngram model
m = LinearNgramClassifier()
m.fit(dset['text'], dset['label'])
preds = m.predict(dset_val['text'])
acc = (preds == dset_val['label']).mean()
print('validation acc', acc)
```
"""
assert checkpoint in ["countvectorizer", "tfidfvectorizer"]
self.checkpoint = checkpoint
self.tokenizer = tokenizer
self.ngrams = ngrams
self.all_ngrams = all_ngrams
self.random_state = random_state
def fit(
self,
X_text: ArrayLike,
y: ArrayLike,
verbose=True,
):
"""Extract embeddings then fit linear model
Parameters
----------
X_text: ArrayLike[str]
y: ArrayLike[str]
"""
# metadata
if isinstance(self, ClassifierMixin):
self.classes_ = unique_labels(y)
if self.random_state is not None:
np.random.seed(self.random_state)
# set up model
if verbose:
print("initializing model...")
        # extract ngram features
        if verbose:
            print("extracting ngram features...")
if self.all_ngrams:
lower_ngram = 1
else:
lower_ngram = self.ngrams
# get vectorizer
if self.checkpoint == "countvectorizer":
self.vectorizer = CountVectorizer(
tokenizer=self.tokenizer, ngram_range=(
lower_ngram, self.ngrams)
)
elif self.checkpoint == "tfidfvectorizer":
self.vectorizer = TfidfVectorizer(
tokenizer=self.tokenizer, ngram_range=(
lower_ngram, self.ngrams)
)
        # fit vectorizer and build the ngram feature matrix
        embs = self.vectorizer.fit_transform(X_text)
# train linear
warnings.filterwarnings("ignore", category=ConvergenceWarning)
if verbose:
print("training linear model...")
if isinstance(self, ClassifierMixin):
self.linear = LogisticRegressionCV()
elif isinstance(self, RegressorMixin):
self.linear = RidgeCV()
self.linear.fit(embs, y)
return self
def predict(self, X_text):
"""For regression returns continuous output.
For classification, returns discrete output.
"""
check_is_fitted(self)
embs = self.vectorizer.transform(X_text)
return self.linear.predict(embs)
    def predict_proba(self, X_text):
        """For classification, returns the predicted probability of each class."""
        check_is_fitted(self)
        embs = self.vectorizer.transform(X_text)
        return self.linear.predict_proba(embs)
class LinearNgramRegressor(LinearNgram, RegressorMixin):
...
class LinearNgramClassifier(LinearNgram, ClassifierMixin):
...
if __name__ == "__main__":
import imodelsx.data
dset, k = imodelsx.data.load_huggingface_dataset(
"rotten_tomatoes", binary_classification=False, subsample_frac=0.1
)
print(dset)
print(dset["train"])
print(np.unique(dset["train"]["label"]))
clf = LinearNgramClassifier()
clf.fit(dset["train"]["text"], dset["train"]["label"])
print("predicting")
preds = clf.predict(dset["test"]["text"])
print(preds.shape)
print("predicting proba")
preds_proba = clf.predict_proba(dset["test"]["text"])
print(preds_proba.shape)
assert preds_proba.shape[0] == preds.shape[0]
print(
"acc_train",
np.mean(clf.predict(dset["train"]["text"]) == dset["train"]["label"]),
)
print("acc_test", np.mean(preds == dset["test"]["label"]))
Classes
class LinearNgram (checkpoint: str = 'tfidfvectorizer', tokenizer=None, ngrams=2, all_ngrams=True, random_state=None)
-
Base class for all estimators in scikit-learn.
Inheriting from this class provides default implementations of:
- setting and getting parameters used by GridSearchCV and friends;
- textual and HTML representation displayed in terminals and IDEs;
- estimator serialization;
- parameters validation;
- data validation;
- feature names validation.
Read more in the scikit-learn User Guide on rolling your own estimator.
Notes
All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).
Examples
>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])
LinearNgram class. Use LinearNgramClassifier or LinearNgramRegressor rather than instantiating this class directly.
Parameters
checkpoint : str
- Name of vectorizer checkpoint: "countvectorizer" or "tfidfvectorizer"
tokenizer
- Optional custom tokenizer (callable) passed through to the vectorizer
ngrams
- Order of ngrams to extract. 1 for unigrams, 2 for bigrams, etc.
all_ngrams
- Whether to use all ngrams of order <= the ngrams argument (see the sketch below)
random_state
- Random seed for fitting
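Internally, all_ngrams only controls the lower bound of the ngram_range handed to the sklearn vectorizer (this mirrors the lower_ngram logic in fit):

```
# with ngrams=3:
#   all_ngrams=True  -> ngram_range=(1, 3)   # unigrams + bigrams + trigrams
#   all_ngrams=False -> ngram_range=(3, 3)   # trigrams only
```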
Example
```
from imodelsx import LinearNgramClassifier
import datasets
import numpy as np

# load data
dset = datasets.load_dataset('rotten_tomatoes')['train']
dset = dset.select(np.random.choice(len(dset), size=300, replace=False))
dset_val = datasets.load_dataset('rotten_tomatoes')['validation']
dset_val = dset_val.select(np.random.choice(len(dset_val), size=300, replace=False))

# fit a simple ngram model
m = LinearNgramClassifier()
m.fit(dset['text'], dset['label'])
preds = m.predict(dset_val['text'])
acc = (preds == dset_val['label']).mean()
print('validation acc', acc)
```
Ancestors
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
Subclasses
- LinearNgramClassifier
- LinearNgramRegressor
Methods
def fit(self, X_text: ArrayLike, y: ArrayLike, verbose=True)
-
Extract ngram features, then fit a linear model.
Parameters
X_text : ArrayLike[str]
y : ArrayLike
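For illustration, a minimal fit sketch with a custom tokenizer (str.split is just a stand-in for any callable; the toy data is repeated so LogisticRegressionCV's internal 5-fold split has enough samples per class):

```
from imodelsx import LinearNgramClassifier

texts = ["a great movie", "a dull movie", "great fun", "dull and slow"] * 5
labels = [1, 0, 1, 0] * 5

# any callable can be passed through to the underlying vectorizer
m = LinearNgramClassifier(checkpoint="countvectorizer", tokenizer=str.split)
m.fit(texts, labels, verbose=False)
```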
def predict(self, X_text)
-
For regression, returns continuous output; for classification, returns discrete output.
def predict_proba(self, X_text)
-
For classification, returns the predicted probability of each class.
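A quick sketch of the two prediction methods and their output shapes (toy data, made up for illustration):

```
from imodelsx import LinearNgramClassifier

texts = ["good", "bad", "fine", "awful"] * 5
labels = [1, 0, 1, 0] * 5

m = LinearNgramClassifier().fit(texts, labels, verbose=False)
print(m.predict(["good"]))              # discrete label, e.g. array([1])
print(m.predict_proba(["good"]).shape)  # (1, 2): one row per input, one column per class
```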
def set_fit_request(self, *, X_text: bool | None | str = '$UNCHANGED$', verbose: bool | None | str = '$UNCHANGED$') -> LinearNgram
-
Request metadata passed to the fit method.
Note that this method is only relevant if enable_metadata_routing=True (see sklearn.set_config). See the scikit-learn User Guide on metadata routing for how the routing mechanism works.
The options for each parameter are:
- True: metadata is requested, and passed to fit if provided. The request is ignored if metadata is not provided.
- False: metadata is not requested and the meta-estimator will not pass it to fit.
- None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.
- str: metadata should be passed to the meta-estimator with this given alias instead of the original name.
The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.
Added in version 1.3.
Note: this method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. inside a sklearn.pipeline.Pipeline. Otherwise it has no effect.
Parameters
X_text : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
- Metadata routing for the X_text parameter in fit.
verbose : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
- Metadata routing for the verbose parameter in fit.
Returns
self : object
- The updated object.
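For illustration, a minimal sketch (assumes scikit-learn >= 1.3; calling a set_*_request method without first enabling routing raises a RuntimeError):

```
import sklearn
from imodelsx import LinearNgramClassifier

# metadata routing must be enabled globally first
sklearn.set_config(enable_metadata_routing=True)

# ask meta-estimators to forward `verbose` to fit()
m = LinearNgramClassifier().set_fit_request(verbose=True)
```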
def set_predict_proba_request(self, *, X_text: bool | None | str = '$UNCHANGED$') -> LinearNgram
-
Request metadata passed to the predict_proba method. The options and semantics are the same as for set_fit_request above, applied to the X_text parameter of predict_proba. Only relevant when enable_metadata_routing=True and the estimator is used as a sub-estimator of a meta-estimator. Added in version 1.3.
Returns
self : object
- The updated object.
def set_predict_request(self, *, X_text: bool | None | str = '$UNCHANGED$') -> LinearNgram
-
Request metadata passed to the predict method. The options and semantics are the same as for set_fit_request above, applied to the X_text parameter of predict. Only relevant when enable_metadata_routing=True and the estimator is used as a sub-estimator of a meta-estimator. Added in version 1.3.
Returns
self : object
- The updated object.
class LinearNgramClassifier (checkpoint: str = 'tfidfvectorizer', tokenizer=None, ngrams=2, all_ngrams=True, random_state=None)
-
Classification variant of LinearNgram: fits LogisticRegressionCV on top of the ngram features. See LinearNgram above for parameters and a usage example.
Expand source code
class LinearNgramClassifier(LinearNgram, ClassifierMixin): ...
Ancestors
- LinearNgram
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
- sklearn.base.ClassifierMixin
Methods
def set_score_request(self, *, sample_weight: bool | None | str = '$UNCHANGED$') -> LinearNgramClassifier
-
Request metadata passed to the score method. The options and semantics are the same as for the set_*_request methods on LinearNgram above, applied to the sample_weight parameter of score. Only relevant when enable_metadata_routing=True and the estimator is used as a sub-estimator of a meta-estimator. Added in version 1.3.
Returns
self : object
- The updated object.
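For illustration, a minimal sketch (assumes scikit-learn >= 1.3 and that a meta-estimator, e.g. a cross-validation utility, will later supply sample_weight):

```
import sklearn
from imodelsx import LinearNgramClassifier

sklearn.set_config(enable_metadata_routing=True)

# ask meta-estimators to forward sample_weight to score()
m = LinearNgramClassifier().set_score_request(sample_weight=True)
```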
Inherited members
- LinearNgram: fit, predict, predict_proba
class LinearNgramRegressor (checkpoint: str = 'tfidfvectorizer', tokenizer=None, ngrams=2, all_ngrams=True, random_state=None)
-
Regression variant of LinearNgram: fits RidgeCV on top of the ngram features. See LinearNgram above for parameters; a usage sketch follows the source below.
Expand source code
class LinearNgramRegressor(LinearNgram, RegressorMixin): ...
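A minimal usage sketch for the regressor (toy texts and targets, made up for illustration):

```
from imodelsx import LinearNgramRegressor

texts = ["terrible film", "mediocre film", "great film", "fantastic film"]
scores = [0.0, 0.4, 0.8, 1.0]  # toy continuous targets

m = LinearNgramRegressor()
m.fit(texts, scores, verbose=False)
print(m.predict(["a great film"]))  # continuous output from RidgeCV
```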
Ancestors
- LinearNgram
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
- sklearn.base.RegressorMixin
Methods
def set_score_request(self, *, sample_weight: bool | None | str = '$UNCHANGED$') -> LinearNgramRegressor
-
Request metadata passed to the score method. Identical in semantics to LinearNgramClassifier.set_score_request above, applied to the sample_weight parameter of score. Only relevant when enable_metadata_routing=True and the estimator is used as a sub-estimator of a meta-estimator. Added in version 1.3.
Returns
self : object
- The updated object.
Inherited members
- LinearNgram: fit, predict