diff --git a/docs/source/notebooks/pca_gp_plaid_pipeline.ipynb b/docs/source/notebooks/pca_gp_plaid_pipeline.ipynb new file mode 100644 index 0000000..5f37fdd --- /dev/null +++ b/docs/source/notebooks/pca_gp_plaid_pipeline.ipynb @@ -0,0 +1,2711 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example of a PCA-GP-PCA pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_regression\n", + "\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", + "from sklearn.gaussian_process import GaussianProcessRegressor\n", + "\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer, TransformedTargetRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from plaid.containers.dataset import Dataset\n", + "from plaid.containers.sample import Sample\n", + "from plaid.problem_definition import ProblemDefinition\n", + "from plaid.wrappers.sklearn import WrappedSklearnTransform, WrappedSklearnRegressor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate some synthetic regression data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "NB_SAMPLES = 103\n", + "NB_INPUT_SCALARS = 3\n", + "NB_OUTPUT_SCALARS = 5\n", + "FIELD_SIZE = 17\n", + "\n", + "X, y = make_regression(n_samples=NB_SAMPLES, n_features=NB_INPUT_SCALARS, n_targets=NB_OUTPUT_SCALARS + FIELD_SIZE, noise=0.1, random_state=0)  # fixed seed: same data on every Restart & Run All" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,\n", + " 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,\n", + " 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,\n", + " 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,\n", + " 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,\n", + " 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,\n", + " 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,\n", + " 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dset = Dataset()\n", + "samples = []\n", + "for sample_id in range(NB_SAMPLES):\n", + "    sample = Sample()\n", + "    for scalar_id in range(NB_INPUT_SCALARS):\n", + "        sample.add_scalar(f\"input_scalar_{scalar_id}\", X[sample_id, scalar_id])\n", + "    for scalar_id in range(NB_OUTPUT_SCALARS):\n", + "        sample.add_scalar(f\"output_scalar_{scalar_id}\", y[sample_id, scalar_id])\n", + "    sample.init_base(topological_dim=3, physical_dim=3)\n", + "    sample.init_zone(np.array([0, 0, 0]))\n", + "    sample.add_field(\"output_field\", y[sample_id, NB_OUTPUT_SCALARS:])  # remaining FIELD_SIZE target columns form the field\n", + "    samples.append(sample)\n", + "dset.add_samples(samples)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# pr_def = ProblemDefinition()\n", + "# pr_def.set_split({\n", + "#     'train': np.arange(NB_SAMPLES * 0.8),\n", + "#     'test': np.arange(NB_SAMPLES * 0.8,NB_SAMPLES),\n", + "# })\n", + "# pr_def.add_input_scalars_names([f\"input_scalar_{scalar_id}\" for scalar_id in range(NB_INPUT_SCALARS)])\n", + "# pr_def.add_output_scalars_names([f\"output_scalar_{scalar_id}\" for scalar_id in range(NB_OUTPUT_SCALARS)])\n", + "# pr_def.add_output_fields_names(['output_field'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PCA-GP-PCA as a pipeline with PLAID inputs/outputs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Define the PCA for the shape embedding\n", + "\n", + "In this example we only apply PCA to the scalars, other features are not modified" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
WrappedSklearnTransform(PCA(n_components=2))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "WrappedSklearnTransform(PCA(n_components=2))" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### old version\n", + "# feats_to_reduce = list(range(NB_INPUT_SCALARS))\n", + "# preprocessor = ColumnTransformer(\n", + "# transformers=[\n", + "# (\n", + "# \"pca\",\n", + "# WrappedSklearnTransform(PCA(n_components=2), in_features=['scalar::all']),\n", + "# feats_to_reduce,\n", + "# ),\n", + "# ],\n", + "# remainder=\"passthrough\",\n", + "# )\n", + "\n", + "### new version\n", + "preprocessor = WrappedSklearnTransform(PCA(n_components=2), in_features=[f\"scalar::input_scalar_{scalar_id}\" for scalar_id in range(NB_INPUT_SCALARS)])\n", + "preprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
WrappedSklearnTransform(PCA(n_components=2))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "WrappedSklearnTransform(PCA(n_components=2))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessor.fit(dataset=dset)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessor._is_fitted" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== In <_convert_y_to_plaid>\n", + "- self.sklearn_block.feature_names_in_ not found\n", + "- self.sklearn_block.get_feature_names_out()=array(['pca0', 'pca1'], dtype=object)\n", + "- self.output_scalars=[]\n", + "- self.output_time_series=[]\n", + "- self.output_fields=[]\n", + "- dataset.get_scalar_names()=['input_scalar_0', 'input_scalar_1', 'input_scalar_2', 'output_scalar_0', 'output_scalar_1', 'output_scalar_2', 'output_scalar_3', 'output_scalar_4']\n", + "- dataset.get_time_series_names()=[]\n", + "- dataset.get_field_names()=['output_field']\n", + "- y.shape=(103, 2)\n", + "- dataset.get_scalar_names()=['input_scalar_0', 'input_scalar_1', 'input_scalar_2', 'output_scalar_0', 'output_scalar_1', 'output_scalar_2', 'output_scalar_3', 'output_scalar_4']\n", + "- dataset.get_time_series_names()=[]\n", + "- dataset.get_field_names()=['output_field']\n" + ] + }, + { + "data": { + "text/plain": [ + "(True,\n", + " ['input_scalar_0',\n", + " 'input_scalar_1',\n", + " 'input_scalar_2',\n", + " 'output_scalar_0',\n", + " 'output_scalar_1',\n", + " 'output_scalar_2',\n", + " 'output_scalar_3',\n", + " 'output_scalar_4'],\n", + " ['output_field'])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out_dset = preprocessor.transform(dset)\n", + "id(dset) == id(out_dset), 
out_dset.get_scalar_names(), out_dset.get_field_names()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(preprocessor.sklearn_block.explained_variance_ratio_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Define the output scaler for the output fields (MinMaxScaler + PCA)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('scaler',\n",
+       "                 WrappedSklearnTransform(in_features=['field::output_field'], sklearn_block=MinMaxScaler())),\n",
+       "                ('pca',\n",
+       "                 WrappedSklearnTransform(in_features=['field::output_field'], sklearn_block=PCA(n_components=9)))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('scaler',\n", + " WrappedSklearnTransform(in_features=['field::output_field'], sklearn_block=MinMaxScaler())),\n", + " ('pca',\n", + " WrappedSklearnTransform(in_features=['field::output_field'], sklearn_block=PCA(n_components=9)))])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "postprocessor = Pipeline(\n", + " [\n", + " (\"scaler\", WrappedSklearnTransform(MinMaxScaler(), in_features=[\"field::output_field\"])),\n", + " (\"pca\", WrappedSklearnTransform(PCA(n_components=9), in_features=[\"field::output_field\"])),\n", + " ]\n", + ")\n", + "postprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== In <_convert_y_to_plaid>\n", + "- self.sklearn_block.feature_names_in_ not found\n", + "- self.sklearn_block.get_feature_names_out()=array(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',\n", + " 'x11', 'x12', 'x13', 'x14', 'x15', 'x16'], dtype=object)\n", + "- self.output_scalars=[]\n", + "- self.output_time_series=[]\n", + "- self.output_fields=[]\n", + "- dataset.get_scalar_names()=['input_scalar_0', 'input_scalar_1', 'input_scalar_2', 'output_scalar_0', 'output_scalar_1', 'output_scalar_2', 'output_scalar_3', 'output_scalar_4']\n", + "- dataset.get_time_series_names()=[]\n", + "- dataset.get_field_names()=['output_field']\n", + "- y.shape=(103, 17)\n", + "- dataset.get_scalar_names()=['input_scalar_0', 'input_scalar_1', 'input_scalar_2', 'output_scalar_0', 'output_scalar_1', 'output_scalar_2', 'output_scalar_3', 'output_scalar_4']\n", + "- dataset.get_time_series_names()=[]\n", + "- dataset.get_field_names()=['output_field']\n" + ] + }, + { + "ename": "TypeError", + "evalue": "PlaidWrapper.fit() takes 2 positional arguments but 3 were given", + "output_type": "error", + "traceback": [ + 
"\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[13]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mpostprocessor\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdset\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\d606912\\.conda\\envs\\plaid_dev\\Lib\\site-packages\\sklearn\\base.py:1363\u001b[39m, in \u001b[36m_fit_context..decorator..wrapper\u001b[39m\u001b[34m(estimator, *args, **kwargs)\u001b[39m\n\u001b[32m 1356\u001b[39m estimator._validate_params()\n\u001b[32m 1358\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[32m 1359\u001b[39m skip_parameter_validation=(\n\u001b[32m 1360\u001b[39m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[32m 1361\u001b[39m )\n\u001b[32m 1362\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1363\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\d606912\\.conda\\envs\\plaid_dev\\Lib\\site-packages\\sklearn\\pipeline.py:661\u001b[39m, in \u001b[36mPipeline.fit\u001b[39m\u001b[34m(self, X, y, **params)\u001b[39m\n\u001b[32m 655\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._final_estimator != \u001b[33m\"\u001b[39m\u001b[33mpassthrough\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 656\u001b[39m last_step_params = \u001b[38;5;28mself\u001b[39m._get_metadata_for_step(\n\u001b[32m 657\u001b[39m 
step_idx=\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m) - \u001b[32m1\u001b[39m,\n\u001b[32m 658\u001b[39m step_params=routed_params[\u001b[38;5;28mself\u001b[39m.steps[-\u001b[32m1\u001b[39m][\u001b[32m0\u001b[39m]],\n\u001b[32m 659\u001b[39m all_params=params,\n\u001b[32m 660\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m661\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_final_estimator\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mXt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mlast_step_params\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfit\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 663\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n", + "\u001b[31mTypeError\u001b[39m: PlaidWrapper.fit() takes 2 positional arguments but 3 were given" + ] + } + ], + "source": [ + "postprocessor.fit(dset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. 
Define the regressor\n", + "\n", + "The GP is trained on the transformed targets `postprocessor(y)`; predictions are mapped back with the inverse transform: `y_pred = postprocessor.inverse_transform(GP(X))`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "regressor = TransformedTargetRegressor(\n", + "    regressor=WrappedSklearnRegressor(\n", + "        GaussianProcessRegressor(n_restarts_optimizer=3),\n", + "        in_features=[f\"scalar::input_scalar_{scalar_id}\" for scalar_id in range(NB_INPUT_SCALARS)],\n", + "        out_features=[\"field::output_field\", *[f\"scalar::output_scalar_{scalar_id}\" for scalar_id in range(NB_OUTPUT_SCALARS)]],\n", + "    ),\n", + "    check_inverse=False,\n", + "    transformer=postprocessor,\n", + ")\n", + "regressor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Combine to make the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = Pipeline(\n", + "    steps=[\n", + "        (\"preprocessor\", preprocessor),\n", + "        (\"scaler\", WrappedSklearnTransform(StandardScaler(), in_features=[f\"scalar::input_scalar_{scalar_id}\" for scalar_id in range(NB_INPUT_SCALARS)])),\n", + "        (\"regressor\", regressor),\n", + "    ]\n", + ")\n", + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fit the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.fit(dset)\n", + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Predict on the training data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model.predict(dset)  # the model was fitted on `dset`, so predict on the same PLAID dataset rather than the raw array X" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Other way to define the pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Define the regressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "regressor = Pipeline(\n", + "    steps=[\n", + "        (\"preprocessor\", preprocessor),\n", + "        (\"scaler\", WrappedSklearnTransform(\n", + "            StandardScaler(),\n", + "            in_features=[f\"scalar::input_scalar_{scalar_id}\" for scalar_id in range(NB_INPUT_SCALARS)],\n", + "        )),\n", + "        (\"regressor\", WrappedSklearnRegressor(\n", + "            GaussianProcessRegressor(n_restarts_optimizer=3),\n", + "            in_features=[f\"scalar::input_scalar_{scalar_id}\" for scalar_id in range(NB_INPUT_SCALARS)],\n", + "            out_features=[\"field::output_field\", *[f\"scalar::output_scalar_{scalar_id}\" for scalar_id in range(NB_OUTPUT_SCALARS)]],\n", + "        )),\n", + "    ]\n", + ")\n", + "regressor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Combine to make the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = TransformedTargetRegressor(\n", + "    regressor=regressor,\n", + "    check_inverse=False,\n", + "    transformer=postprocessor,\n", + ")\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.fit(X, y)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "plaid_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/notebooks/pca_gp_sklearn_pipeline.ipynb b/docs/source/notebooks/pca_gp_sklearn_pipeline.ipynb new file mode 100644 index 0000000..f865796 --- /dev/null +++ b/docs/source/notebooks/pca_gp_sklearn_pipeline.ipynb @@ -0,0 +1,7899 @@ +{ + 
"cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example of a PCA-GP-PCA pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_regression\n", + "\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", + "from sklearn.gaussian_process import GaussianProcessRegressor\n", + "\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer, TransformedTargetRegressor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate some synthetic regression data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = make_regression(n_samples=100, n_features=10, noise=0.1, n_targets=11, random_state=0)  # fixed seed: same data on every Restart & Run All" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PCA-GP-PCA as an sklearn pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Define the PCA for the shape embedding\n", + "\n", + "In this example we only apply PCA to the first 8 columns\n", + "\n", + "The last two columns are unchanged" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
ColumnTransformer(remainder='passthrough',\n",
+       "                  transformers=[('pca', PCA(n_components=8),\n",
+       "                                 [0, 1, 2, 3, 4, 5, 6, 7])])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "ColumnTransformer(remainder='passthrough',\n", + " transformers=[('pca', PCA(n_components=8),\n", + " [0, 1, 2, 3, 4, 5, 6, 7])])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feats_to_reduce = list(range(8))\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " (\n", + " \"pca\",\n", + " PCA(n_components=8),\n", + " feats_to_reduce,\n", + " ),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "preprocessor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Define the output scaler for the output fields (MinMaxScaler + PCA)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('scaler', MinMaxScaler()), ('pca', PCA(n_components=9))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('scaler', MinMaxScaler()), ('pca', PCA(n_components=9))])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "postprocessor = Pipeline(\n", + " [\n", + " (\"scaler\", MinMaxScaler()),\n", + " (\"pca\", PCA(n_components=9)),\n", + " ]\n", + ")\n", + "postprocessor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Define the regressor\n", + "\n", + "The GP is trained on the transformed targets `postprocessor(y)`; predictions are mapped back with the inverse transform: `y_pred = postprocessor.inverse_transform(GP(X))`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
TransformedTargetRegressor(check_inverse=False,\n",
+       "                           regressor=GaussianProcessRegressor(n_restarts_optimizer=3),\n",
+       "                           transformer=Pipeline(steps=[('scaler',\n",
+       "                                                        MinMaxScaler()),\n",
+       "                                                       ('pca',\n",
+       "                                                        PCA(n_components=9))]))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "TransformedTargetRegressor(check_inverse=False,\n", + " regressor=GaussianProcessRegressor(n_restarts_optimizer=3),\n", + " transformer=Pipeline(steps=[('scaler',\n", + " MinMaxScaler()),\n", + " ('pca',\n", + " PCA(n_components=9))]))" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regressor = TransformedTargetRegressor(\n", + " regressor=GaussianProcessRegressor(\n", + " n_restarts_optimizer=3,\n", + " ),\n", + " check_inverse=False,\n", + " transformer=postprocessor,\n", + ")\n", + "regressor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Combine to make the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocessor',\n",
+       "                 ColumnTransformer(remainder='passthrough',\n",
+       "                                   transformers=[('pca', PCA(n_components=8),\n",
+       "                                                  [0, 1, 2, 3, 4, 5, 6, 7])])),\n",
+       "                ('scaler', StandardScaler()),\n",
+       "                ('regressor',\n",
+       "                 TransformedTargetRegressor(check_inverse=False,\n",
+       "                                            regressor=GaussianProcessRegressor(n_restarts_optimizer=3),\n",
+       "                                            transformer=Pipeline(steps=[('scaler',\n",
+       "                                                                         MinMaxScaler()),\n",
+       "                                                                        ('pca',\n",
+       "                                                                         PCA(n_components=9))])))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.