If you have seen the tutorial Reuse Data you may have noticed that we used a standard sklearn optimizer for hyperparameter tuning. This is fine for many use cases, but it might not be the best choice for some others.
We will use a simple pipeline for the iris dataset.¶
In [ ]:
Copied!
# Patch inspect.getsource (backed by dill, per the cell output) so that
# labchain can retrieve the source of objects defined in notebook cells.
from labchain.utils.patch_type_guard import patch_inspect_for_notebooks
patch_inspect_for_notebooks()
# NOTE(review): the two lines below look like a notebook-export echo of the
# same cell; running them again simply repeats the import and the call.
from labchain.utils.patch_type_guard import patch_inspect_for_notebooks
patch_inspect_for_notebooks()
✅ Patched inspect.getsource using dill.
In [ ]:
Copied!
# Load the iris dataset and split it into train/test sets (80/20, fixed seed)
# via labchain's XYData wrapper.
# NOTE(review): _hash/_path presumably identify this dataset in labchain's
# storage layer (the fit output later loads cached artifacts by hash) — confirm.
from sklearn import datasets
from labchain.base.base_clases import XYData
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = XYData(
_hash="Iris ", _path="/dataset", _value=[]
).train_test_split(
iris.data,
iris.target,
test_size=0.2,
random_state=42, # type: ignore
)
# NOTE(review): the lines below are a notebook-export echo of the same cell;
# they redo the identical split (same random_state, so same partition).
from sklearn import datasets
from labchain.base.base_clases import XYData
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = XYData(
_hash="Iris ", _path="/dataset", _value=[]
).train_test_split(
iris.data,
iris.target,
test_size=0.2,
random_state=42, # type: ignore
)
Then we will configure Grid Search for hyperparameter tuning and a Sklearn splitter for cross validation.¶
In [ ]:
Copied!
# Build the pipeline:
#   - a cached StandardScaler step followed by a KNN classifier whose
#     n_neighbors is declared as a grid of candidates (2..6),
#   - F1 / Precission / Recall metrics ("Precission" is the library's
#     spelling of the class name),
#   - 2-fold shuffled cross-validation via KFoldSplitter,
#   - GridOptimizer selecting the best candidate by F1 score.
from labchain import (
F1,
Cached,
F3Pipeline,
KnnFilter,
Precission,
StandardScalerPlugin,
)
from labchain.plugins.metrics.classification import Recall, XYData
from labchain.plugins.optimizer.grid_optimizer import GridOptimizer
from labchain.plugins.splitter.cross_validation_splitter import KFoldSplitter
wandb_pipeline = (
F3Pipeline(
filters=[
Cached(StandardScalerPlugin()),
KnnFilter().grid({"n_neighbors": [2, 3, 4, 5, 6]}),
],
metrics=[F1(), Precission(), Recall()],
)
.splitter(
KFoldSplitter(
n_splits=2,
shuffle=True,
random_state=42,
)
)
.optimizer(GridOptimizer(scorer=F1()))
)
# NOTE(review): the lines below are a notebook-export echo of the same cell;
# they rebuild the identical pipeline object.
from labchain import (
F1,
Cached,
F3Pipeline,
KnnFilter,
Precission,
StandardScalerPlugin,
)
from labchain.plugins.metrics.classification import Recall, XYData
from labchain.plugins.optimizer.grid_optimizer import GridOptimizer
from labchain.plugins.splitter.cross_validation_splitter import KFoldSplitter
wandb_pipeline = (
F3Pipeline(
filters=[
Cached(StandardScalerPlugin()),
KnnFilter().grid({"n_neighbors": [2, 3, 4, 5, 6]}),
],
metrics=[F1(), Precission(), Recall()],
)
.splitter(
KFoldSplitter(
n_splits=2,
shuffle=True,
random_state=42,
)
)
.optimizer(GridOptimizer(scorer=F1()))
)
In [4]:
Copied!
# Fit the pipeline on the training split (the output below shows the grid
# search selecting n_neighbors=4) and predict on the held-out test split.
wandb_pipeline.fit(X_train, y_train)
_y = wandb_pipeline.predict(x=X_test)
# NOTE(review): the lines below are a notebook-export echo of the same cell.
wandb_pipeline.fit(X_train, y_train)
_y = wandb_pipeline.predict(x=X_test)
{'KnnFilter': {'n_neighbors': [2, 3, 4, 5, 6]}}
____________________________________________________________________________________________________
Fitting pipeline...
****************************************************************************************************
Cached(filter=StandardScalerPlugin(), cache_data=True, cache_filter=True, overwrite=False, storage=None)
- El filtro StandardScalerPlugin({}) Existe, se carga del storage.
- El dato XYData(_hash='8cf971f3f80c552a61c115451c6de5ed26ac6c9d', _path='StandardScalerPlugin/0f98887c2bd6020b824a410979d85cbf1d8ebfd4') Existe, se carga del storage.
KnnFilter( n_neighbors=4, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None )
____________________________________________________________________________________________________
Predicting with KFold Splitter......
****************************************************************************************************
F3Pipeline( filters=[ Cached(filter=StandardScalerPlugin(), cache_data=True, cache_filter=True, overwrite=False, storage=None), KnnFilter( n_neighbors=4, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None ) ], metrics=[F1(average='weighted'), Precission(average='weighted'), Recall(average='weighted')], overwrite=False, store=False, log=False )
____________________________________________________________________________________________________
Predicting pipeline...
****************************************************************************************************
Cached(filter=StandardScalerPlugin(), cache_data=True, cache_filter=True, overwrite=False, storage=None)
- El dato XYData(_hash='b4ff2a642069bfc672713ea400d29c66ecf21d93', _path='StandardScalerPlugin/0f98887c2bd6020b824a410979d85cbf1d8ebfd4') Existe, se carga del storage.
KnnFilter( n_neighbors=4, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None )
In [5]:
Copied!
# Inspect the ground-truth labels of the test split (second line is the
# notebook-export echo of the same cell).
y_test.value
y_test.value
Out[5]:
array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
0, 2, 2, 2, 2, 2, 0, 0])
In [6]:
Copied!
# Inspect the predicted labels; they match y_test.value exactly in the
# outputs above (second line is the notebook-export echo of the same cell).
_y.value
_y.value
Out[6]:
array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
0, 2, 2, 2, 2, 2, 0, 0])
In [7]:
Copied!
# Score predictions against the ground truth with the configured metrics
# (second line is the notebook-export echo of the same cell).
wandb_pipeline.evaluate(X_test, y_test, _y)
wandb_pipeline.evaluate(X_test, y_test, _y)
____________________________________________________________________________________________________
Evaluating pipeline......
****************************************************************************************************
Out[7]:
{'F1': 1.0, 'Precission': 1.0, 'Recall': 1.0}
Grid search results¶
In [8]:
Copied!
# Show the grid-search results table: one row per n_neighbors candidate with
# its cross-validation score, sorted best-first (second line is the
# notebook-export echo of the same cell).
wandb_pipeline._results
wandb_pipeline._results
Out[8]:
| KnnFilter | score | |
|---|---|---|
| 2 | {'n_neighbors': 4} | 0.933723 |
| 4 | {'n_neighbors': 6} | 0.932844 |
| 1 | {'n_neighbors': 3} | 0.925411 |
| 3 | {'n_neighbors': 5} | 0.916946 |
| 0 | {'n_neighbors': 2} | 0.908650 |