If you have seen the tutorial Reuse Data you may have noticed that we used a standard sklearn optimizer for hyperparameter tuning. This is fine for many use cases, but it might not be the best choice for some others.
We will use a simple pipeline for the iris dataset.¶
In [ ]:
Copied!
# Patch inspect.getsource (backed by dill, per the cell output) so that
# labchain can retrieve the source of objects defined in notebook cells.
from labchain.utils.patch_type_guard import patch_inspect_for_notebooks
patch_inspect_for_notebooks()
# NOTE(review): the two lines below look like a notebook-export echo of the
# same cell; running them again simply repeats the import and the call.
from labchain.utils.patch_type_guard import patch_inspect_for_notebooks
patch_inspect_for_notebooks()
✅ Patched inspect.getsource using dill.
In [ ]:
Copied!
# Load the iris dataset and split it into train/test sets (80/20, fixed seed)
# via labchain's XYData wrapper.
# NOTE(review): _hash/_path presumably identify this dataset in labchain's
# storage layer (the fit output later loads cached artifacts by hash) — confirm.
from sklearn import datasets
from labchain.base.base_clases import XYData
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = XYData(
_hash="Iris ", _path="/dataset", _value=[]
).train_test_split(
iris.data,
iris.target,
test_size=0.2,
random_state=42, # type: ignore
)
# NOTE(review): the lines below are a notebook-export echo of the same cell;
# they redo the identical split (same random_state, so same partition).
from sklearn import datasets
from labchain.base.base_clases import XYData
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = XYData(
_hash="Iris ", _path="/dataset", _value=[]
).train_test_split(
iris.data,
iris.target,
test_size=0.2,
random_state=42, # type: ignore
)
Then we will configure Grid Search for hyperparameter tuning and a Sklearn splitter for cross validation.¶
In [ ]:
Copied!
# Build the pipeline:
#   - a cached StandardScaler step followed by a KNN classifier whose
#     n_neighbors is declared as a grid of candidates (2..6),
#   - F1 / Precission / Recall metrics ("Precission" is the library's
#     spelling of the class name),
#   - 2-fold shuffled cross-validation via KFoldSplitter,
#   - GridOptimizer selecting the best candidate by F1 score.
from labchain import (
F1,
Cached,
F3Pipeline,
KnnFilter,
Precission,
StandardScalerPlugin,
)
from labchain.plugins.metrics.classification import Recall, XYData
from labchain.plugins.optimizer.grid_optimizer import GridOptimizer
from labchain.plugins.splitter.cross_validation_splitter import KFoldSplitter
wandb_pipeline = (
F3Pipeline(
filters=[
Cached(StandardScalerPlugin()),
KnnFilter().grid({"n_neighbors": [2, 3, 4, 5, 6]}),
],
metrics=[F1(), Precission(), Recall()],
)
.splitter(
KFoldSplitter(
n_splits=2,
shuffle=True,
random_state=42,
)
)
.optimizer(GridOptimizer(scorer=F1()))
)
# NOTE(review): the lines below are a notebook-export echo of the same cell;
# they rebuild the identical pipeline object.
from labchain import (
F1,
Cached,
F3Pipeline,
KnnFilter,
Precission,
StandardScalerPlugin,
)
from labchain.plugins.metrics.classification import Recall, XYData
from labchain.plugins.optimizer.grid_optimizer import GridOptimizer
from labchain.plugins.splitter.cross_validation_splitter import KFoldSplitter
wandb_pipeline = (
F3Pipeline(
filters=[
Cached(StandardScalerPlugin()),
KnnFilter().grid({"n_neighbors": [2, 3, 4, 5, 6]}),
],
metrics=[F1(), Precission(), Recall()],
)
.splitter(
KFoldSplitter(
n_splits=2,
shuffle=True,
random_state=42,
)
)
.optimizer(GridOptimizer(scorer=F1()))
)
In [4]:
Copied!
# Fit the pipeline on the training split (the output below shows the grid
# search selecting n_neighbors=4) and predict on the held-out test split.
wandb_pipeline.fit(X_train, y_train)
_y = wandb_pipeline.predict(x=X_test)
# NOTE(review): the lines below are a notebook-export echo of the same cell.
wandb_pipeline.fit(X_train, y_train)
_y = wandb_pipeline.predict(x=X_test)
{'KnnFilter': {'n_neighbors': [2, 3, 4, 5, 6]}}
____________________________________________________________________________________________________
Fitting pipeline...
****************************************************************************************************
Cached(filter=StandardScalerPlugin(), cache_data=True, cache_filter=True, overwrite=False, storage=None)
- El filtro StandardScalerPlugin({}) Existe, se carga del storage.
- El dato XYData(_hash='8cf971f3f80c552a61c115451c6de5ed26ac6c9d', _path='StandardScalerPlugin/0f98887c2bd6020b824a410979d85cbf1d8ebfd4') Existe, se carga del storage.
KnnFilter( n_neighbors=4, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None )
____________________________________________________________________________________________________
Predicting with KFold Splitter......
****************************************************************************************************
F3Pipeline( filters=[ Cached(filter=StandardScalerPlugin(), cache_data=True, cache_filter=True, overwrite=False, storage=None), KnnFilter( n_neighbors=4, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None ) ], metrics=[F1(average='weighted'), Precission(average='weighted'), Recall(average='weighted')], overwrite=False, store=False, log=False )
____________________________________________________________________________________________________
Predicting pipeline...
****************************************************************************************************
Cached(filter=StandardScalerPlugin(), cache_data=True, cache_filter=True, overwrite=False, storage=None)
- El dato XYData(_hash='b4ff2a642069bfc672713ea400d29c66ecf21d93', _path='StandardScalerPlugin/0f98887c2bd6020b824a410979d85cbf1d8ebfd4') Existe, se carga del storage.
KnnFilter( n_neighbors=4, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None )
In [5]:
Copied!
# Inspect the ground-truth labels of the test split (second line is the
# notebook-export echo of the same cell).
y_test.value
y_test.value
Out[5]:
array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
0, 2, 2, 2, 2, 2, 0, 0])
In [6]:
Copied!
# Inspect the predicted labels; they match y_test.value exactly in the
# outputs above (second line is the notebook-export echo of the same cell).
_y.value
_y.value
Out[6]:
array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
0, 2, 2, 2, 2, 2, 0, 0])
In [7]:
Copied!
# Score predictions against the ground truth with the configured metrics
# (second line is the notebook-export echo of the same cell).
wandb_pipeline.evaluate(X_test, y_test, _y)
wandb_pipeline.evaluate(X_test, y_test, _y)
____________________________________________________________________________________________________
Evaluating pipeline......
****************************************************************************************************
Out[7]:
{'F1': 1.0, 'Precission': 1.0, 'Recall': 1.0}
Grid search results¶
In [8]:
Copied!
# Show the grid-search results table: one row per n_neighbors candidate with
# its cross-validation score, sorted best-first (second line is the
# notebook-export echo of the same cell).
wandb_pipeline._results
wandb_pipeline._results
Out[8]:
| KnnFilter | score | |
|---|---|---|
| 2 | {'n_neighbors': 4} | 0.933723 |
| 4 | {'n_neighbors': 6} | 0.932844 |
| 1 | {'n_neighbors': 3} | 0.925411 |
| 3 | {'n_neighbors': 5} | 0.916946 |
| 0 | {'n_neighbors': 2} | 0.908650 |