🎉 scikit-learn pipeline models in examples #101

Draft · wants to merge 20 commits into base: main
1 change: 1 addition & 0 deletions .gitignore
@@ -72,6 +72,7 @@ docs/source/notebooks/*.yaml
# Heavy data
*.cgns
!notebooks/ex_rotor37_pv.cgns
*artifact*

###############################################################################
# .gitignore PATTERN FORMAT:
57 changes: 57 additions & 0 deletions examples/pipelines/config.yml
@@ -0,0 +1,57 @@
global:
  dataset_path: "PLAID-datasets/VKI-LS59"
  save_path: "./artifacts"
  train_split_name: "train"
  test_split_name: "test"

input_scalar_scaler:
  type: "MinMaxScaler"
  scalar_names:
    - angle_in
    - mach_out

output_scalar_scaler:
  type: "MinMaxScaler"
  scalar_names:
    - Q
    - power
    - Pr
    - Tr
    - eth_is
    - angle_out

pca_nodes:
  base_name: Base_2_2
  field_name: nodes
  n_components: 3

pca_mach:
  base_name: Base_2_2
  field_name: mach
  n_components: 5

pca_nut:
  base_name: Base_2_2
  field_name: nut
  n_components: 8

regressor_mach:
  type: "GaussianProcessRegressor"
  options:
    kernel: Matern
    kernel_options:
      nu: 2.5
    optim: fmin_l_bfgs_b
    num_restarts: 2
    anisotropic: True
    random_state: 42
    show_warnings: False
  input:
    scalar_names:
      - angle_in
      - mach_out
    vector_names:
      - reduced_nodes
  output:
    vector_names:
      - reduced_mach
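
Note for reviewers: with the regressor_mach settings above, GPRegressorNode.fit (in ml_pipeline_nodes.py below) builds the following scikit-learn kernel. This is a sketch, assuming 5 input features (angle_in, mach_out, plus the 3 reduced_nodes components produced by pca_nodes):

import numpy as np
from sklearn.gaussian_process.kernels import ConstantKernel, Matern, WhiteKernel

# anisotropic: True gives one Matern length scale per input feature;
# the feature count of 5 is an assumption based on the config above.
kernel = ConstantKernel() * Matern(
    length_scale=np.ones(5),
    length_scale_bounds=(1e-8, 1e8),
    nu=2.5,
) + WhiteKernel(noise_level_bounds=(1e-8, 1e8))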
262 changes: 262 additions & 0 deletions examples/pipelines/ml_pipeline_nodes.py
@@ -0,0 +1,262 @@
# -*- coding: utf-8 -*-
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
#
#

import copy

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from plaid.containers.dataset import Dataset

available_scalers = {
    "StandardScaler": StandardScaler,
    "MinMaxScaler": MinMaxScaler,
}

class ScalarScalerNode(BaseEstimator, TransformerMixin):
    """Scales named scalar quantities of a PLAID dataset with a scikit-learn scaler."""

    def __init__(self, params):
        self.params = params

        self.type_ = params["type"]
        self.scalar_names = params["scalar_names"]

        assert self.type_ in available_scalers, f"Scaler {self.type_} not available"

        self.model = None

    def get_scalars(self, dataset):
        if isinstance(dataset, list):
            dataset = Dataset.from_list_of_samples(dataset)
        return dataset.get_scalars_to_tabular(
            scalar_names=self.scalar_names,
            as_nparray=True,
        )

    def set_scalars(self, dataset, scalars):
        for i in range(len(dataset)):
            for j, sn in enumerate(self.scalar_names):
                dataset[i].add_scalar(sn, scalars[i, j])

    def fit(self, dataset, y=None):
        self.model = available_scalers[self.type_]()

        scalars = self.get_scalars(dataset)
        self.model.fit(scalars)
        self.fitted_ = True
        return self

    def transform(self, dataset):
        scalars = self.get_scalars(dataset)
        scaled_scalars = self.model.transform(scalars)
        dataset_ = copy.deepcopy(dataset)
        self.set_scalars(dataset_, scaled_scalars)
        return dataset_

    def inverse_transform(self, dataset):
        scaled_scalars = self.get_scalars(dataset)
        scalars = self.model.inverse_transform(scaled_scalars)
        dataset_ = copy.deepcopy(dataset)
        self.set_scalars(dataset_, scalars)
        return dataset_


class PCAEmbeddingNode(BaseEstimator, RegressorMixin, TransformerMixin):
    """Embeds a mesh field (or the mesh nodes) into a low-dimensional PCA basis."""

    def __init__(self, params, n_components=None):
        self.params = params

        # An explicit n_components argument (e.g. set by a hyperparameter search)
        # takes precedence over the value from the config.
        self.n_components = n_components if n_components is not None else params["n_components"]

        self.field_name = params["field_name"]
        self.zone_name = params.get("zone_name")
        self.base_name = params.get("base_name")
        self.location = params.get("location", "Vertex")

        self.model = None

    def get_all_fields(self, dataset):
        all_fields = []
        for sample in dataset:
            for time in sample.get_all_mesh_times():
                if self.field_name == "nodes":
                    field = sample.get_nodes(self.zone_name, self.base_name, time).flatten()
                else:
                    field = sample.get_field(
                        self.field_name, self.zone_name, self.base_name, self.location, time
                    )
                all_fields.append(field)
        return np.array(all_fields)

    def set_reduced_fields(self, dataset, reduced_fields):
        for i in range(len(dataset)):
            for j in range(self.n_components):
                dataset[i].add_scalar(f"reduced_{self.field_name}_{j}", reduced_fields[i, j])

    def get_reduced_fields(self, dataset):
        if isinstance(dataset, list):
            dataset = Dataset.from_list_of_samples(dataset)
        return dataset.get_scalars_to_tabular(
            scalar_names=[f"reduced_{self.field_name}_{j}" for j in range(self.n_components)],
            as_nparray=True,
        )

    def set_fields(self, dataset, fields):  # TODO: this will not work with multiple time steps per sample
        for i in range(len(dataset)):
            for time in dataset[i].get_all_mesh_times():
                dataset[i].add_field(
                    self.field_name, fields[i], self.zone_name, self.base_name, self.location, time
                )

    def fit(self, dataset, y=None):
        self.model = PCA(n_components=self.n_components)

        all_fields = self.get_all_fields(dataset)
        self.model.fit(all_fields)
        self.fitted_ = True
        return self

    def transform(self, dataset):
        all_fields = self.get_all_fields(dataset)
        reduced_fields = self.model.transform(all_fields)
        dataset_ = copy.deepcopy(dataset)
        self.set_reduced_fields(dataset_, reduced_fields)
        return dataset_

    def inverse_transform(self, dataset):
        reduced_fields = self.get_reduced_fields(dataset)
        fields = self.model.inverse_transform(reduced_fields)
        dataset_ = copy.deepcopy(dataset)
        self.set_fields(dataset_, fields)
        return dataset_


from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, Matern, WhiteKernel
from sklearn.multioutput import MultiOutputRegressor

available_kernel_classes = {
    "Matern": Matern,
}

class GPRegressorNode(BaseEstimator, RegressorMixin, TransformerMixin):
    """Gaussian process regressor mapping input scalars to output scalars of a PLAID dataset."""

    def __init__(self, params):
        self.params = params

        self.type_ = params["type"]
        self.input = params["input"]
        self.output = params["output"]
        self.options = params["options"]

        assert self.type_ == "GaussianProcessRegressor"
        assert self.options["kernel"] in available_kernel_classes, (
            f"scikit-learn kernel {self.options['kernel']} not available"
        )

        self.model = None

    def get_scalars(self, dataset):
        if isinstance(dataset, list):
            dataset = Dataset.from_list_of_samples(dataset)
        return dataset.get_scalars_to_tabular(
            scalar_names=self.input_names,
            as_nparray=True,
        )

    def fit(self, dataset, y=None):
        if isinstance(dataset, list):
            dataset = Dataset.from_list_of_samples(dataset)
        all_available_scalar = dataset.get_scalar_names()

        # Input/output names are resolved at fit time: "vector" entries expand to the
        # per-component scalars (e.g. reduced_nodes_0, reduced_nodes_1, ...) that
        # PCAEmbeddingNode adds upstream.
        self.input_names = []
        if "scalar_names" in self.input:
            self.input_names += self.input["scalar_names"]
        if "vector_names" in self.input:
            for vn in self.input["vector_names"]:
                self.input_names += [s for s in all_available_scalar if s.startswith(vn)]

        self.output_names = []
        if "scalar_names" in self.output:
            self.output_names += self.output["scalar_names"]
        if "vector_names" in self.output:
            for vn in self.output["vector_names"]:
                self.output_names += [s for s in all_available_scalar if s.startswith(vn)]

        kernel_class = available_kernel_classes[self.options["kernel"]]
        if self.options["anisotropic"]:
            # One length scale per input feature.
            kernel = ConstantKernel() * kernel_class(
                length_scale=np.ones(len(self.input_names)),
                length_scale_bounds=(1e-8, 1e8),
                **self.options["kernel_options"],
            ) + WhiteKernel(noise_level_bounds=(1e-8, 1e8))
        else:
            kernel = kernel_class(
                length_scale_bounds=(1e-8, 1e8),
                **self.options["kernel_options"],
            ) + WhiteKernel(noise_level_bounds=(1e-8, 1e8))

        gpr = GaussianProcessRegressor(
            kernel=kernel,
            optimizer=self.options["optim"],
            n_restarts_optimizer=self.options["num_restarts"],
            random_state=self.options["random_state"],
        )

        # One independent GP per output scalar.
        self.model = MultiOutputRegressor(gpr)
        X = dataset.get_scalars_to_tabular(
            scalar_names=self.input_names,
            as_nparray=True,
        )
        y = dataset.get_scalars_to_tabular(
            scalar_names=self.output_names,
            as_nparray=True,
        )
        self.model.fit(X, y)

        self.fitted_ = True
        return self

    def predict(self, dataset):
        if isinstance(dataset, list):
            dataset = Dataset.from_list_of_samples(dataset)
        X = dataset.get_scalars_to_tabular(
            scalar_names=self.input_names,
            as_nparray=True,
        )

        pred = self.model.predict(X)
        if len(self.output_names) == 1:
            pred = pred.reshape((-1, 1))

        dataset_ = copy.deepcopy(dataset)
        for i in range(len(dataset)):
            for j, sn in enumerate(self.output_names):
                dataset_[i].add_scalar(sn, pred[i, j])

        return dataset_

    def transform(self, dataset):
        return dataset

    def inverse_transform(self, dataset):
        return dataset

    def score(self, dataset, dataset_ref=None):
        if not dataset_ref:
            # Case where GridSearchCV calls score with a single argument: search.fit(dataset)
            dataset_ref = dataset
        if isinstance(dataset, list):
            dataset = Dataset.from_list_of_samples(dataset)
        X = dataset.get_scalars_to_tabular(
            scalar_names=self.input_names,
            as_nparray=True,
        )
        if isinstance(dataset_ref, list):
            dataset_ref = Dataset.from_list_of_samples(dataset_ref)
        y = dataset_ref.get_scalars_to_tabular(
            scalar_names=self.output_names,
            as_nparray=True,
        )
        return self.model.score(X, y)
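
Usage note: a minimal sketch of how these nodes could be chained with the config above. The Pipeline wiring and the commented fit/predict calls are assumptions for illustration; only the node classes and config keys come from this PR:

import yaml
from sklearn.pipeline import Pipeline

with open("examples/pipelines/config.yml") as f:
    config = yaml.safe_load(f)

pipe = Pipeline([
    ("scale_inputs", ScalarScalerNode(config["input_scalar_scaler"])),
    ("pca_nodes", PCAEmbeddingNode(config["pca_nodes"])),
    ("pca_mach", PCAEmbeddingNode(config["pca_mach"])),
    ("gp_mach", GPRegressorNode(config["regressor_mach"])),
])

# train_dataset / test_dataset: plaid Datasets (or lists of Samples), assumed loaded elsewhere.
# pipe.fit(train_dataset)
# predictions = pipe.predict(test_dataset)

Predicted reduced_mach_* coefficients could then be mapped back to full fields with pca_mach's inverse_transform; that reconstruction step is not shown here.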
