From 55bbd85fd431f4dad591f0af3531fc66998f5586 Mon Sep 17 00:00:00 2001
From: Fabien Casenave
Date: Tue, 24 Jun 2025 17:26:26 +0200
Subject: [PATCH 01/19] wip

---
 examples/pipeline/pipefunc_ex.py        |  67 +++++++++++++
 examples/pipeline/pipefunc_tensile2d.py | 122 ++++++++++++++++++++++++
 examples/pipeline/test.py               |  13 +++
 src/plaid/bridges/huggingface_bridge.py |  33 ++++---
 4 files changed, 222 insertions(+), 13 deletions(-)
 create mode 100644 examples/pipeline/pipefunc_ex.py
 create mode 100644 examples/pipeline/pipefunc_tensile2d.py
 create mode 100644 examples/pipeline/test.py

diff --git a/examples/pipeline/pipefunc_ex.py b/examples/pipeline/pipefunc_ex.py
new file mode 100644
index 0000000..05e2617
--- /dev/null
+++ b/examples/pipeline/pipefunc_ex.py
@@ -0,0 +1,67 @@
+import numpy as np
+from skimage import data, filters, measure
+from skimage.color import rgb2gray
+from skimage.segmentation import find_boundaries
+
+from pipefunc import Pipeline, pipefunc
+
+
+# Step 1: Image Loading and Preprocessing
+@pipefunc(output_name="gray_image", mapspec="image[n] -> gray_image[n]")
+def load_and_preprocess_image(image):
+    return rgb2gray(image)
+
+
+# Step 2: Image Segmentation
+@pipefunc(output_name="segmented_image", mapspec="gray_image[n] -> segmented_image[n]")
+def segment_image(gray_image):
+    return filters.sobel(gray_image)
+
+
+# Step 3: Feature Extraction
+@pipefunc(output_name="feature", mapspec="segmented_image[n] -> feature[n]")
+def extract_feature(segmented_image):
+    boundaries = find_boundaries(segmented_image > 0.1)
+    labeled_image = measure.label(boundaries)
+    num_regions = np.max(labeled_image)
+    return {"num_regions": num_regions}
+
+
+# Step 4: Object Classification
+@pipefunc(output_name="classification", mapspec="feature[n] -> classification[n]")
+def classify_object(feature):
+    # Classify image as 'Complex' if the number of regions is above a threshold.
+    classification = "Complex" if feature["num_regions"] > 5 else "Simple"
+    return classification
+
+
+# Step 5: Result Aggregation
+@pipefunc(output_name="summary")
+def aggregate_results(classification):
+    simple_count = sum(1 for c in classification if c == "Simple")
+    complex_count = len(classification) - simple_count
+    return {"Simple": simple_count, "Complex": complex_count}
+
+
+if __name__ == "__main__":
+    # Create the pipeline
+    pipeline_img = Pipeline(
+        [
+            load_and_preprocess_image,
+            segment_image,
+            extract_feature,
+            classify_object,
+            aggregate_results,
+        ],
+    )
+
+    # Simulate a batch of images (using built-in scikit-image sample images)
+    images = [
+        data.astronaut(),
+        data.coffee(),
+        data.coffee(),
+    ] # Repeat the coffee image to simulate multiple images
+
+    # Run the pipeline on the images
+    results_summary = pipeline_img.map({"image": images})
+    print("Classification Summary:", results_summary["summary"].output)
\ No newline at end of file
diff --git a/examples/pipeline/pipefunc_tensile2d.py b/examples/pipeline/pipefunc_tensile2d.py
new file mode 100644
index 0000000..c14912a
--- /dev/null
+++ b/examples/pipeline/pipefunc_tensile2d.py
@@ -0,0 +1,122 @@
+from datasets import load_from_disk
+from plaid.containers.sample import Sample
+from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition
+import os, pickle
+from safetensors.numpy import save_file
+
+from pipefunc import Pipeline, pipefunc
+from sklearn.preprocessing import StandardScaler
+
+
+
+# ids_train = hf_dataset.description["split"]['train_500']
+# ids_test = hf_dataset.description["split"]['test']
+
+# sample_train_0 = Sample.model_validate(pickle.loads(hf_dataset[ids_train[0]]["sample"]))
+# sample_test_0 = Sample.model_validate(pickle.loads(hf_dataset[ids_test[0]]["sample"]))
+
+
+# Step 1: Image Loading and Preprocessing
+@pipefunc(output_name="hf_dataset")
+def load(path):
+    return load_from_disk(path)
+
+@pipefunc(output_name="dataset")
+def convert_to_plaid(hf_dataset):
+    return huggingface_dataset_to_plaid(hf_dataset)[0]
+
+@pipefunc(output_name="problem_definition")
+def generate_prob_def(hf_dataset):
+    return huggingface_description_to_problem_definition(hf_dataset.description)
+
+# # Step 2: Image Segmentation
+@pipefunc(output_name="scaler")
+def scale_scalars(dataset, problem_definition, train_split_name, test_split_name):
+
+    ids_train = problem_definition.get_split(train_split_name)
+    train_scalars = dataset.get_scalars_to_tabular(
+        sample_ids = ids_train
+    )
+
+    ids_test = problem_definition.get_split(test_split_name)
+    test_scalars = dataset.get_scalars_to_tabular(
+        sample_ids = ids_test
+    )
+
+    for sn in problem_definition.get_input_scalars_names():
+        scaler = StandardScaler()
+        scaler.fit_transform(train_scalars[sn].reshape(-1, 1))
+        scaler.fit(test_scalars[sn].reshape(-1, 1))
+
+    # print(ids_train)
+    # print(ids_test)
+    return scaler
+
+@pipefunc(output_name="saved_path")
+def save(scaler, out_path):
+
+    os.makedirs(out_path, exist_ok=True)
+
+    # Save only NumPy-compatible parameters
+    tensors = {
+        "scaler_mean": scaler.mean_,
+        "scaler_scale": scaler.scale_,
+    }
+    saved_path = os.path.join(out_path, "scaler.safetensors")
+    save_file(tensors, saved_path)
+    return saved_path
+
+# # Step 3: Feature Extraction
+# @pipefunc(output_name="feature", mapspec="segmented_image[n] -> feature[n]")
+# def extract_feature(segmented_image):
+# boundaries = find_boundaries(segmented_image > 0.1)
+# labeled_image = measure.label(boundaries)
+# num_regions = np.max(labeled_image) +# return {"num_regions": num_regions} + + +# # Step 4: Object Classification +# @pipefunc(output_name="classification", mapspec="feature[n] -> classification[n]") +# def classify_object(feature): +# # Classify image as 'Complex' if the number of regions is above a threshold. +# classification = "Complex" if feature["num_regions"] > 5 else "Simple" +# return classification + + +# # Step 5: Result Aggregation +# @pipefunc(output_name="summary") +# def aggregate_results(classification): +# simple_count = sum(1 for c in classification if c == "Simple") +# complex_count = len(classification) - simple_count +# return {"Simple": simple_count, "Complex": complex_count} + + +if __name__ == "__main__": + # Create the pipeline + pipeline = Pipeline( + [ + load, + convert_to_plaid, + generate_prob_def, + scale_scalars, + save, + ], + profile=True + ) + + # pipeline.visualize() + + path = "Z:\\Users\\d582428\\Downloads\\Tensile2d" + out_path = "Z:\\Users\\d582428\\Downloads\\Tensile2d\\artifacts" + + train_split_name = "train_500" + test_split_name = "test" + + # Run the pipeline + pipeline("saved_path", + path = path, + train_split_name=train_split_name, + test_split_name=test_split_name, + out_path = out_path) + pipeline.print_profiling_stats() + # print("Dataset:", type(dataset[0:10]), type(dataset)) diff --git a/examples/pipeline/test.py b/examples/pipeline/test.py new file mode 100644 index 0000000..a04d664 --- /dev/null +++ b/examples/pipeline/test.py @@ -0,0 +1,13 @@ +from datasets import load_from_disk +from plaid.containers.sample import Sample +from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition + + + +path = "Z:\\Users\\d582428\\Downloads\\Tensile2d" + + +hf_dataset = load_from_disk(path) +ds, pbd = huggingface_dataset_to_plaid(hf_dataset) +for sample in ds: + print(sample) \ No newline at end of file diff --git a/src/plaid/bridges/huggingface_bridge.py b/src/plaid/bridges/huggingface_bridge.py index ac78d0b..f3a5af4 100644 --- a/src/plaid/bridges/huggingface_bridge.py +++ b/src/plaid/bridges/huggingface_bridge.py @@ -135,6 +135,25 @@ def plaid_generator_to_huggingface( return ds +def huggingface_description_to_problem_definition( + description: dict, +) -> ProblemDefinition: + """Docstring to complete.""" + problem_definition = ProblemDefinition() + problem_definition.set_task(description["task"]) + problem_definition.set_split(description["split"]) + problem_definition.add_input_scalars_names(description["in_scalars_names"]) + problem_definition.add_output_scalars_names(description["out_scalars_names"]) + problem_definition.add_input_timeseries_names(description["in_timeseries_names"]) + problem_definition.add_output_timeseries_names(description["out_timeseries_names"]) + problem_definition.add_input_fields_names(description["in_fields_names"]) + problem_definition.add_output_fields_names(description["out_fields_names"]) + problem_definition.add_input_meshes_names(description["in_meshes_names"]) + problem_definition.add_output_meshes_names(description["out_meshes_names"]) + + return problem_definition + + def huggingface_dataset_to_plaid( ds: datasets.Dataset, ) -> tuple[Self, ProblemDefinition]: @@ -172,19 +191,7 @@ def huggingface_dataset_to_plaid( dataset.set_infos(infos) - problem_definition = ProblemDefinition() - problem_definition.set_task(ds.description["task"]) - problem_definition.set_split(ds.description["split"]) - 
problem_definition.add_input_scalars_names(ds.description["in_scalars_names"]) - problem_definition.add_output_scalars_names(ds.description["out_scalars_names"]) - problem_definition.add_input_timeseries_names(ds.description["in_timeseries_names"]) - problem_definition.add_output_timeseries_names( - ds.description["out_timeseries_names"] - ) - problem_definition.add_input_fields_names(ds.description["in_fields_names"]) - problem_definition.add_output_fields_names(ds.description["out_fields_names"]) - problem_definition.add_input_meshes_names(ds.description["in_meshes_names"]) - problem_definition.add_output_meshes_names(ds.description["out_meshes_names"]) + problem_definition = huggingface_description_to_problem_definition(ds.description) return dataset, problem_definition From 6d43c2a5dcff2c54ac0fce465c71469e80de3b32 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Wed, 25 Jun 2025 18:41:02 +0200 Subject: [PATCH 02/19] update --- examples/pipeline/config.yml | 9 ++ examples/pipeline/pipefunc_tensile2d.py | 147 ++++++++++++------------ examples/pipeline/test.py | 13 --- 3 files changed, 83 insertions(+), 86 deletions(-) create mode 100644 examples/pipeline/config.yml delete mode 100644 examples/pipeline/test.py diff --git a/examples/pipeline/config.yml b/examples/pipeline/config.yml new file mode 100644 index 0000000..af5f63a --- /dev/null +++ b/examples/pipeline/config.yml @@ -0,0 +1,9 @@ +load: + - path: + +scale_scalars: + - train_split_name: "train_500" + - test_split_name: "test" + +save: + - out_path: ".\\artifacts" \ No newline at end of file diff --git a/examples/pipeline/pipefunc_tensile2d.py b/examples/pipeline/pipefunc_tensile2d.py index c14912a..4f3a15d 100644 --- a/examples/pipeline/pipefunc_tensile2d.py +++ b/examples/pipeline/pipefunc_tensile2d.py @@ -7,97 +7,96 @@ from pipefunc import Pipeline, pipefunc from sklearn.preprocessing import StandardScaler +import yaml -# ids_train = hf_dataset.description["split"]['train_500'] -# ids_test = hf_dataset.description["split"]['test'] +@pipefunc(output_name=("dataset", "prob_def")) +def load_hf_from_disk(path): + return huggingface_dataset_to_plaid(load_from_disk(path)) -# sample_train_0 = Sample.model_validate(pickle.loads(hf_dataset[ids_train[0]]["sample"])) -# sample_test_0 = Sample.model_validate(pickle.loads(hf_dataset[ids_test[0]]["sample"])) +@pipefunc(output_name="scalar_scalers") +def scale_scalars(dataset, prob_def, train_split_name, test_split_name): + print(">>", dataset[0].get_scalar("p1")) -# Step 1: Image Loading and Preprocessing -@pipefunc(output_name="hf_dataset") -def load(path): - return load_from_disk(path) - -@pipefunc(output_name="dataset") -def convert_to_plaid(hf_dataset): - return huggingface_dataset_to_plaid(hf_dataset)[0] - -@pipefunc(output_name="problem_definition") -def generate_prob_def(hf_dataset): - return huggingface_description_to_problem_definition(hf_dataset.description) - -# # Step 2: Image Segmentation -@pipefunc(output_name="scaler") -def scale_scalars(dataset, problem_definition, train_split_name, test_split_name): - - ids_train = problem_definition.get_split(train_split_name) + ids_train = prob_def.get_split(train_split_name) train_scalars = dataset.get_scalars_to_tabular( sample_ids = ids_train ) - ids_test = problem_definition.get_split(test_split_name) + ids_test = prob_def.get_split(test_split_name) test_scalars = dataset.get_scalars_to_tabular( sample_ids = ids_test ) - for sn in problem_definition.get_input_scalars_names(): + scalar_scalers = {} + + 
print(prob_def.get_input_scalars_names()) + + for sn in prob_def.get_input_scalars_names(): + scaler = StandardScaler() + scaler.fit_transform(train_scalars[sn].reshape(-1, 1)) + scaler.transform(test_scalars[sn].reshape(-1, 1)) + scalar_scalers[sn] = scaler + + for sn in prob_def.get_output_scalars_names(): scaler = StandardScaler() scaler.fit_transform(train_scalars[sn].reshape(-1, 1)) - scaler.fit(test_scalars[sn].reshape(-1, 1)) + scalar_scalers[sn] = scaler - # print(ids_train) - # print(ids_test) - return scaler + for j, i in enumerate(ids_train): + sample = dataset[i] + for sn, scaler in scalar_scalers.items(): + if sn in sample.get_scalar_names(): + sample.add_scalar(sn, train_scalars[sn][j]) + + for j, i in enumerate(ids_test): + sample = dataset[i] + for sn, scaler in scalar_scalers.items(): + if sn in sample.get_scalar_names(): + sample.add_scalar(sn, test_scalars[sn][j]) + + + print(">>", dataset[0].get_scalar("p1")) + + return scalar_scalers @pipefunc(output_name="saved_path") -def save(scaler, out_path): +def save(scalar_scalers, out_path, dataset): os.makedirs(out_path, exist_ok=True) # Save only NumPy-compatible parameters - tensors = { - "scaler_mean": scaler.mean_, - "scaler_scale": scaler.scale_, - } - saved_path = os.path.join(out_path, "scaler.safetensors") - save_file(tensors, saved_path) - return saved_path - -# # Step 3: Feature Extraction -# @pipefunc(output_name="feature", mapspec="segmented_image[n] -> feature[n]") -# def extract_feature(segmented_image): -# boundaries = find_boundaries(segmented_image > 0.1) -# labeled_image = measure.label(boundaries) -# num_regions = np.max(labeled_image) -# return {"num_regions": num_regions} - - -# # Step 4: Object Classification -# @pipefunc(output_name="classification", mapspec="feature[n] -> classification[n]") -# def classify_object(feature): -# # Classify image as 'Complex' if the number of regions is above a threshold. 
-# classification = "Complex" if feature["num_regions"] > 5 else "Simple" -# return classification - - -# # Step 5: Result Aggregation -# @pipefunc(output_name="summary") -# def aggregate_results(classification): -# simple_count = sum(1 for c in classification if c == "Simple") -# complex_count = len(classification) - simple_count -# return {"Simple": simple_count, "Complex": complex_count} + for sn, scaler in scalar_scalers.items(): + tensors = { + "scaler_mean": scaler.mean_, + "scaler_scale": scaler.scale_, + } + saved_path = os.path.join(out_path, f"scaler_{sn}.safetensors") + save_file(tensors, saved_path) + + return dataset + + +def extract_leaf_keys(d): + leaves = {} + if isinstance(d, dict): + for k, v in d.items(): + if isinstance(v, dict) or isinstance(v, list): + leaves.update(extract_leaf_keys(v)) + else: + leaves[k] = v + elif isinstance(d, list): + for item in d: + leaves.update(extract_leaf_keys(item)) + return leaves if __name__ == "__main__": # Create the pipeline pipeline = Pipeline( [ - load, - convert_to_plaid, - generate_prob_def, + load_hf_from_disk, scale_scalars, save, ], @@ -106,17 +105,19 @@ def save(scaler, out_path): # pipeline.visualize() - path = "Z:\\Users\\d582428\\Downloads\\Tensile2d" - out_path = "Z:\\Users\\d582428\\Downloads\\Tensile2d\\artifacts" + with open("config.yml") as f: + config = yaml.safe_load(f) + + parameters = extract_leaf_keys(config) + print(parameters) - train_split_name = "train_500" - test_split_name = "test" # Run the pipeline - pipeline("saved_path", - path = path, - train_split_name=train_split_name, - test_split_name=test_split_name, - out_path = out_path) - pipeline.print_profiling_stats() + dataset = pipeline(**parameters) + # pipeline.print_profiling_stats() # print("Dataset:", type(dataset[0:10]), type(dataset)) + + print(dataset) + + print(">>", dataset[0].get_scalar("p1")) + diff --git a/examples/pipeline/test.py b/examples/pipeline/test.py deleted file mode 100644 index a04d664..0000000 --- a/examples/pipeline/test.py +++ /dev/null @@ -1,13 +0,0 @@ -from datasets import load_from_disk -from plaid.containers.sample import Sample -from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition - - - -path = "Z:\\Users\\d582428\\Downloads\\Tensile2d" - - -hf_dataset = load_from_disk(path) -ds, pbd = huggingface_dataset_to_plaid(hf_dataset) -for sample in ds: - print(sample) \ No newline at end of file From c2e96d226ea10e1a0999df49e12dc8ea728d6db8 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Sat, 28 Jun 2025 08:45:38 +0200 Subject: [PATCH 03/19] wip --- examples/pipeline/config.yml | 6 +- examples/pipeline/pipefunc_tensile2d.py | 115 ++++++++++++------------ 2 files changed, 59 insertions(+), 62 deletions(-) diff --git a/examples/pipeline/config.yml b/examples/pipeline/config.yml index af5f63a..7348e7a 100644 --- a/examples/pipeline/config.yml +++ b/examples/pipeline/config.yml @@ -1,9 +1,9 @@ -load: - - path: +load_hf_from_hub: + - path: "PLAID-datasets/Tensile2d" scale_scalars: - train_split_name: "train_500" - test_split_name: "test" save: - - out_path: ".\\artifacts" \ No newline at end of file + - out_path: "./artifacts" \ No newline at end of file diff --git a/examples/pipeline/pipefunc_tensile2d.py b/examples/pipeline/pipefunc_tensile2d.py index 4f3a15d..f67a67c 100644 --- a/examples/pipeline/pipefunc_tensile2d.py +++ b/examples/pipeline/pipefunc_tensile2d.py @@ -1,81 +1,79 @@ -from datasets import load_from_disk +from datasets import load_from_disk, 
load_dataset from plaid.containers.sample import Sample from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition import os, pickle +import numpy as np from safetensors.numpy import save_file from pipefunc import Pipeline, pipefunc from sklearn.preprocessing import StandardScaler import yaml +import time @pipefunc(output_name=("dataset", "prob_def")) def load_hf_from_disk(path): return huggingface_dataset_to_plaid(load_from_disk(path)) -@pipefunc(output_name="scalar_scalers") -def scale_scalars(dataset, prob_def, train_split_name, test_split_name): +@pipefunc(output_name=("dataset", "prob_def")) +def load_hf_from_hub(path): + return huggingface_dataset_to_plaid(load_dataset(path, split="all_samples")) - print(">>", dataset[0].get_scalar("p1")) +@pipefunc(output_name=("scalar_data")) +def scale_scalars(dataset, prob_def, train_split_name, test_split_name, out_path): ids_train = prob_def.get_split(train_split_name) - train_scalars = dataset.get_scalars_to_tabular( - sample_ids = ids_train + input_scalars_train = dataset.get_scalars_to_tabular( + scalar_names = prob_def.get_input_scalars_names(), + sample_ids = ids_train, + as_nparray = True + ) + output_scalars_train = dataset.get_scalars_to_tabular( + scalar_names = prob_def.get_output_scalars_names(), + sample_ids = ids_train, + as_nparray = True ) ids_test = prob_def.get_split(test_split_name) - test_scalars = dataset.get_scalars_to_tabular( - sample_ids = ids_test + input_scalars_test = dataset.get_scalars_to_tabular( + scalar_names = prob_def.get_input_scalars_names(), + sample_ids = ids_test, + as_nparray = True ) - scalar_scalers = {} - - print(prob_def.get_input_scalars_names()) - - for sn in prob_def.get_input_scalars_names(): - scaler = StandardScaler() - scaler.fit_transform(train_scalars[sn].reshape(-1, 1)) - scaler.transform(test_scalars[sn].reshape(-1, 1)) - scalar_scalers[sn] = scaler - - for sn in prob_def.get_output_scalars_names(): - scaler = StandardScaler() - scaler.fit_transform(train_scalars[sn].reshape(-1, 1)) - scalar_scalers[sn] = scaler - - for j, i in enumerate(ids_train): - sample = dataset[i] - for sn, scaler in scalar_scalers.items(): - if sn in sample.get_scalar_names(): - sample.add_scalar(sn, train_scalars[sn][j]) - - for j, i in enumerate(ids_test): - sample = dataset[i] - for sn, scaler in scalar_scalers.items(): - if sn in sample.get_scalar_names(): - sample.add_scalar(sn, test_scalars[sn][j]) + input_scalar_scaler = StandardScaler() + input_scalars_train = input_scalar_scaler.fit_transform(input_scalars_train) + input_scalars_test = input_scalar_scaler.transform(input_scalars_test) + output_scalar_scaler = StandardScaler() + output_scalars_train = output_scalar_scaler.fit_transform(output_scalars_train) - print(">>", dataset[0].get_scalar("p1")) - - return scalar_scalers - -@pipefunc(output_name="saved_path") -def save(scalar_scalers, out_path, dataset): + scalar_data = [ + input_scalar_scaler, + output_scalar_scaler, + input_scalars_train, + input_scalars_test, + output_scalars_train + ] os.makedirs(out_path, exist_ok=True) - # Save only NumPy-compatible parameters - for sn, scaler in scalar_scalers.items(): - tensors = { - "scaler_mean": scaler.mean_, - "scaler_scale": scaler.scale_, - } - saved_path = os.path.join(out_path, f"scaler_{sn}.safetensors") - save_file(tensors, saved_path) + tensors = { + "scaler_mean": input_scalar_scaler.mean_, + "scaler_scale": input_scalar_scaler.scale_, + } + saved_path = os.path.join(out_path, 
f"input_scalar_scaler.safetensors") + save_file(tensors, saved_path) - return dataset + tensors = { + "scaler_mean": output_scalar_scaler.mean_, + "scaler_scale": output_scalar_scaler.scale_, + } + saved_path = os.path.join(out_path, f"output_scalar_scaler.safetensors") + save_file(tensors, saved_path) + + return scalar_data def extract_leaf_keys(d): @@ -93,12 +91,13 @@ def extract_leaf_keys(d): if __name__ == "__main__": - # Create the pipeline + + start = time.time() + pipeline = Pipeline( [ - load_hf_from_disk, + load_hf_from_hub, scale_scalars, - save, ], profile=True ) @@ -109,15 +108,13 @@ def extract_leaf_keys(d): config = yaml.safe_load(f) parameters = extract_leaf_keys(config) - print(parameters) + scalar_data = pipeline(**parameters) - # Run the pipeline - dataset = pipeline(**parameters) - # pipeline.print_profiling_stats() - # print("Dataset:", type(dataset[0:10]), type(dataset)) + print("Pipeline execution time:", time.time() - start) - print(dataset) - print(">>", dataset[0].get_scalar("p1")) + pipeline.print_profiling_stats() + # print("Dataset:", type(dataset[0:10]), type(dataset)) + # print(scalar_data) From 32d8d758950ab6ebc83faf927684f6228cd45c03 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Sun, 29 Jun 2025 15:30:41 +0200 Subject: [PATCH 04/19] wip --- examples/pipeline/ml_pipeline_nodes.py | 377 ++++++++++++++++++++++++ examples/pipeline/pipefunc_tensile2d.py | 24 +- examples/pipeline/sklearn_pipeline.py | 100 +++++++ src/plaid/bridges/huggingface_bridge.py | 94 +++++- 4 files changed, 584 insertions(+), 11 deletions(-) create mode 100644 examples/pipeline/ml_pipeline_nodes.py create mode 100644 examples/pipeline/sklearn_pipeline.py diff --git a/examples/pipeline/ml_pipeline_nodes.py b/examples/pipeline/ml_pipeline_nodes.py new file mode 100644 index 0000000..429e90f --- /dev/null +++ b/examples/pipeline/ml_pipeline_nodes.py @@ -0,0 +1,377 @@ +import os +import json +import joblib +from joblib import Parallel, delayed +import numpy as np +from pathlib import Path +from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin +from sklearn.linear_model import Ridge +from sklearn.decomposition import PCA + + +class PersistentNode(BaseEstimator, RegressorMixin, TransformerMixin): + def __init__(self, save_path): + self.save_path = Path(save_path) + self.fitted_ = False + self.model = None + + def save(self, obj): + self.save_path.parent.mkdir(parents=True, exist_ok=True) + joblib.dump(obj, self.save_path) + self.fitted_ = True + + def load(self): + print(f"Loading existing model from {self.save_path}") + obj = joblib.load(self.save_path) + self.fitted_ = True + return obj + + def exists(self): + return self.save_path.exists() + + def check_fitted_or_load(self): + if self.fitted_: + return + if self.exists(): + self.set_model(self.load()) + else: + raise ValueError("Model not fitted and no saved model found.") + + def set_model(self, obj): + self.model = obj + + def fit(self, X, y=None): + if self.exists(): + self.set_model(self.load()) + return self + self._fit(X, y) + self.save(self.model) + return self + + def transform(self, X): + self.check_fitted_or_load() + return self._transform(X) + + def inverse_transform(self, X): + self.check_fitted_or_load() + return self._inverse_transform(X) + + def predict(self, X): + self.check_fitted_or_load() + return self._predict(X) + + def score(self, X, y): + self.check_fitted_or_load() + return self._score(X, y) + + # Protected methods to override in subclasses + def _fit(self, X, y=None): + raise 
NotImplementedError + + def _predict(self, X): + raise NotImplementedError + + def _transform(self, X): + raise NotImplementedError + + def _inverse_transform(self, X): + raise NotImplementedError + + def _score(self, X, y): + raise NotImplementedError + + + +from sklearn.preprocessing import StandardScaler, MinMaxScaler + +available_scalers = { + "StandardScaler": StandardScaler, + "MinMaxScaler": MinMaxScaler, +} + +class ScalarScalerNode(PersistentNode): + + def __init__(self, name, params): + super().__init__(os.path.join(params['save_path'], f"{name}.joblib")) + self.scalar_names = params[name]['scalar_names'] + self.model = available_scalers[params[name]['type']]() + + def get_scalars(self, dataset): + return dataset.get_scalars_to_tabular( + scalar_names = self.scalar_names, + as_nparray = True + ) + + def set_scalars(self, dataset, scalars): + for i in range(len(dataset)): + for j, sn in enumerate(self.scalar_names): + dataset[i].add_scalar(sn, scalars[i, j]) + + def _fit(self, dataset, y=None): + scalars = self.get_scalars(dataset) + self.model.fit(scalars) + + def _transform(self, dataset): + scalars = self.get_scalars(dataset) + scaled_scalars = self.model.transform(scalars) + self.set_scalars(dataset, scaled_scalars) + + return dataset + + def _inverse_transform(self, dataset): + scaled_scalars = self.get_scalars(dataset) + scalars = self.model.inverse_transform(scaled_scalars) + self.set_scalars(dataset, scalars) + + return dataset + + +class PCAEmbeddingNode(PersistentNode): + + def __init__(self, name:str, params:dict): + super().__init__(os.path.join(params['save_path'], f"{name}.joblib")) + + self.zone_name = params[name]["zone_name"] if "zone_name" in params[name] else None + self.base_name = params[name]["base_name"] if "base_name" in params[name] else None + self.time = params[name]["time"] if "time" in params[name] else None + self.location = params[name]["location"] if "location" in params[name] else "Vertex" + + assert params[name]['type'] == "PCA" + self.n_components = params[name]['n_components'] + self.field_names = list(self.n_components.keys()) + self.model = {name: PCA(n_components = nc) for name, nc in self.n_components.items()} + + def get_all_fields(self, dataset, fn): + all_fields = [] + for sample in dataset: + if fn == "nodes": + field = sample.get_nodes(self.zone_name, self.base_name, self.time).flatten() + else: + field = sample.get_field(fn, self.zone_name, self.base_name, self.location, self.time) + all_fields.append(field) + return np.array(all_fields) + + + def set_reduced_fields(self, dataset, fn, reduced_fields): + for i in range(len(dataset)): + for j in range(self.n_components[fn]): + dataset[i].add_scalar(f"reduced_{fn}_{j}", reduced_fields[i, j]) + + def get_reduced_fields(self, dataset, fn): + return dataset.get_scalars_to_tabular( + scalar_names = [f"reduced_{fn}_{j}" for j in range(self.n_components[fn])], + as_nparray = True + ) + + def set_fields(self, dataset, fn, fields): + for i in range(len(dataset)): + dataset[i].add_field(fn, fields[i], self.zone_name, self.base_name, self.location, self.time) + + def _fit(self, dataset, y=None): + for fn in self.field_names: + all_fields = self.get_all_fields(dataset, fn) + self.model[fn].fit(all_fields) + + def _transform(self, dataset): + for fn in self.field_names: + all_fields = self.get_all_fields(dataset, fn) + reduced_fields = self.model[fn].transform(all_fields) + self.set_reduced_fields(dataset, fn, reduced_fields) + return dataset + + def _inverse_transform(self, dataset): + for fn in 
self.field_names: + reduced_fields = self.get_reduced_fields(dataset, fn) + fields = self.model[fn].inverse_transform(reduced_fields) + self.set_fields(dataset, fn, fields) + return dataset + + +# from Muscat.Containers import MeshGraphTools as MGT +# from Muscat.Bridges.CGNSBridge import CGNSToMesh, MeshToCGNS +# from Muscat.Containers import MeshModificationTools as MMT + +# import sys +# from contextlib import contextmanager + +# @contextmanager +# def suppress_stdout(): +# original_stdout = sys.stdout +# sys.stdout = open(os.devnull, 'w') +# try: +# yield +# finally: +# sys.stdout.close() +# sys.stdout = original_stdout + +# class TutteMorphing(PersistentNode): +# def __init__(self, name, params): +# super().__init__(os.path.join(params['save_path'], f"{name}.joblib")) +# self.prob_def = params['prob_def'] +# self.loc_params = params[name] +# self.model = {} + +# def fit(self, X, y=None): +# # No fitting needed here +# return self + +# def transform(self, dataset): +# return Parallel(n_jobs=self.loc_params['n_jobs'])( +# delayed(self._process_row)(sample) for sample in dataset +# ) + +# # def inverse_transform(self, dataset): +# # return Parallel(n_jobs=self.n_jobs)( +# # delayed(self._process_row)(sample) for sample in dataset +# # ) + + +# def _process_row(self, sample): +# # Your custom transformation logic + +# mesh = CGNSToMesh(sample.get_mesh()) + +# with suppress_stdout(): +# mesh_renumb, renumbering, n_boundary = MGT.RenumberMeshForParametrization( +# mesh, inPlace=False) +# mesh_renumb.elemFields = mesh_renumb.nodeFields = {} +# morphed_mesh, _ = MGT.FloaterMeshParametrization( +# mesh_renumb, n_boundary) + +# # ---# Check invariance +# assert (np.all(renumbering == np.argsort(np.argsort(renumbering)))) +# MMT.NodesPermutation(morphed_mesh, np.argsort(renumbering)) + +# sample.del_tree(time = 0.) 
+# sample.add_tree(MeshToCGNS(morphed_mesh)) + +# return sample + + + + + +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel +from sklearn.multioutput import MultiOutputRegressor + +available_kernel_classes = { + "Matern":Matern +} + +class GPRegressorNode(PersistentNode): + + def __init__(self, name, params): + + super().__init__(os.path.join(params['save_path'], f"{name}.joblib")) + self.loc_params = params[name] + assert self.loc_params['type'] == "GaussianProcessRegressor" + + options = self.loc_params['options'] + assert options["kernel"] in available_kernel_classes.keys(), "scikit-learn kernel "+self.options["kernel"]+" not available" + kernel_class = available_kernel_classes[options["kernel"]] + + self.input_names = self.loc_params['input']['names'] + self.output_names = self.loc_params['output']['names'] + + if options["anisotropic"]: + kernel = ConstantKernel() * kernel_class(length_scale=np.ones(len(self.input_names)), length_scale_bounds=(1e-8, 1e8), + **options["kernel_options"]) + WhiteKernel(noise_level_bounds=(1e-8, 1e8)) + else: + kernel = kernel_class(length_scale_bounds=(1e-8, 1e8), **options["kernel_options"]) \ + + WhiteKernel(noise_level_bounds=(1e-8, 1e8)) + + gpr = GaussianProcessRegressor( + kernel=kernel, + optimizer=options["optim"], + n_restarts_optimizer=options["num_restarts"], + random_state = options["random_state"]) + + self.model = MultiOutputRegressor(gpr) + + + def get_scalars(self, dataset): + return dataset.get_scalars_to_tabular( + scalar_names = self.input_names, + as_nparray = True + ) + + def _fit(self, dataset, y=None): + X = dataset.get_scalars_to_tabular( + scalar_names = self.input_names, + as_nparray = True + ) + y = dataset.get_scalars_to_tabular( + scalar_names = self.output_names, + as_nparray = True + ) + self.model.fit(X, y) + + def _predict(self, dataset): + X = dataset.get_scalars_to_tabular( + scalar_names = self.input_names, + as_nparray = True + ) + + pred= self.model.predict(X) + if len(self.output_names) == 1: + pred = pred.reshape((-1, 1)) + + for i in range(len(dataset)): + for j, sn in enumerate(self.output_names): + dataset[i].add_scalar(sn, pred[i, j]) + + return dataset + + def _transform(self, dataset): + return dataset + + def _inverse_transform(self, dataset): + return dataset + + def _score(self, dataset, dataset_ref): + X = dataset.get_scalars_to_tabular( + scalar_names = self.input_names, + as_nparray = True + ) + y = dataset_ref.get_scalars_to_tabular( + scalar_names = self.output_names, + as_nparray = True + ) + return self.model.score(X, y) + + +class ScalerNode(PersistentNode): + def __init__(self, name, save_path): + super().__init__(os.path.join(save_path, f"{name}.joblib")) + self.model = StandardScaler() + + def _fit(self, X, y=None): + self.model.fit(X) + + def _transform(self, X): + return self.model.transform(X) + + def _inverse_transform(self, X): + return self.model.inverse_transform(X) + + def _predict(self, X): + raise AttributeError("ScalarScalerNode does not support predict.") + + +class RegressorNode(PersistentNode): + def __init__(self, name, save_path, alpha=1.0): + super().__init__(os.path.join(save_path, f"{name}.joblib")) + self.model = Ridge(alpha=alpha) + + def _fit(self, X, y): + self.model.fit(X, y) + + def _predict(self, X): + return self.model.predict(X) + + def _transform(self, X): + raise AttributeError("RegressorNode does not support transform.") + + def _inverse_transform(self, X): + raise 
AttributeError("RegressorNode does not support inverse_transform.") diff --git a/examples/pipeline/pipefunc_tensile2d.py b/examples/pipeline/pipefunc_tensile2d.py index f67a67c..54f3966 100644 --- a/examples/pipeline/pipefunc_tensile2d.py +++ b/examples/pipeline/pipefunc_tensile2d.py @@ -4,21 +4,32 @@ import os, pickle import numpy as np from safetensors.numpy import save_file +from sklearn.base import BaseEstimator, RegressorMixin from pipefunc import Pipeline, pipefunc from sklearn.preprocessing import StandardScaler +from pathlib import Path +import joblib + import yaml import time + @pipefunc(output_name=("dataset", "prob_def")) def load_hf_from_disk(path): return huggingface_dataset_to_plaid(load_from_disk(path)) + @pipefunc(output_name=("dataset", "prob_def")) def load_hf_from_hub(path): - return huggingface_dataset_to_plaid(load_dataset(path, split="all_samples")) + start = time.time() + hf_dataset = load_dataset(path, split="all_samples") + print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds") + dataset = huggingface_dataset_to_plaid(hf_dataset) + return dataset + @pipefunc(output_name=("scalar_data")) def scale_scalars(dataset, prob_def, train_split_name, test_split_name, out_path): @@ -95,10 +106,11 @@ def extract_leaf_keys(d): start = time.time() pipeline = Pipeline( - [ - load_hf_from_hub, - scale_scalars, - ], + [ + load_hf_from_hub, + scale_scalars, + ], + name="ML_Workflow", profile=True ) @@ -111,7 +123,7 @@ def extract_leaf_keys(d): scalar_data = pipeline(**parameters) - print("Pipeline execution time:", time.time() - start) + print(f"Pipeline execution time {time.time() - start:.2g} seconds") pipeline.print_profiling_stats() diff --git a/examples/pipeline/sklearn_pipeline.py b/examples/pipeline/sklearn_pipeline.py new file mode 100644 index 0000000..2c91bad --- /dev/null +++ b/examples/pipeline/sklearn_pipeline.py @@ -0,0 +1,100 @@ +from datasets import load_dataset +from plaid.containers.sample import Sample +from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition +import os, pickle +import numpy as np +from safetensors.numpy import save_file +from sklearn.base import BaseEstimator, RegressorMixin + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from pathlib import Path +import joblib + +import yaml +import time + +from ml_pipeline_nodes import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode #, TutteMorphing + + +with open("config_2.yml") as f: + params = yaml.safe_load(f) + + +start = time.time() +hf_dataset = load_dataset(params['dataset_path'], split="all_samples") +print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds") + +prob_def = huggingface_description_to_problem_definition(hf_dataset.description) + +train_split = prob_def.get_split(params['train_split_name'])[:20] +test_split = prob_def.get_split(params['train_split_name'])[:20] + +dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split) +dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split) + + + +pipeline = Pipeline([ + ('input_scalar_scaler', ScalarScalerNode(name = 'input_scalar_scaler', params = params)), + ('output_scalar_scaler', ScalarScalerNode(name = 'output_scalar_scaler', params = params)), + ('pca_shape_embedding', PCAEmbeddingNode(name = 'pca_shape_embedding', params = params)), + ('pca_field_embedding', PCAEmbeddingNode(name = 'pca_field_embedding', params = params)), + 
('tabular_regressor', GPRegressorNode(name = 'tabular_regressor', params = params)) +]) + +ind = train_split[0] + +pipeline.fit(dataset_train) +print("pipeline fitted") + +dataset_test_2 = pipeline.predict(dataset_test) + +print("score =", pipeline.score(dataset_test, dataset_test_2)) + +print(dataset_test_2) + +# dataset_test._save_to_dir_(os.path.join(params['save_path'], "dataset_test"), verbose = True) +# dataset_test_2._save_to_dir_(os.path.join(params['save_path'], "dataset_test_2"), verbose = True) + +from Muscat.Bridges.CGNSBridge import CGNSToMesh, MeshToCGNS +from Muscat.IO.XdmfWriter import WriteMeshToXdmf + +mesh1 = CGNSToMesh(dataset_test[0].get_mesh(), baseNames = ["Base_2_2"]) +mesh2 = CGNSToMesh(dataset_test_2[0].get_mesh(), baseNames = ["Base_2_2"]) + +print(mesh1) +print(mesh2) + +WriteMeshToXdmf(os.path.join(params['save_path'], "mesh1.xdmf"), + mesh1, + PointFields = [dataset_test[0].get_field(fn, base_name = "Base_2_2") for fn in ['mach', 'nut']], + PointFieldsNames = ['mach', 'nut'] +) +WriteMeshToXdmf(os.path.join(params['save_path'], "mesh2.xdmf"), + mesh2, + PointFields = [dataset_test_2[0].get_field(fn, base_name = "Base_2_2") for fn in ['mach', 'nut']], + PointFieldsNames = ['mach', 'nut'] +) + +# print(dataset_train[ind]) +# print(dataset_train[ind].get_scalar_names()) + +1./0. +dataset_2 = pipeline.inverse_transform(pipeline.predict(dataset)) + +print(dataset_2[0].get_scalar('max_U2_top')) + +# dataset_3 = pipeline.inverse_transform(dataset_2) + +# print(dataset_3[0].get_scalar('p1')) + + + +# dataset_4 = pipeline.inverse_transform(pipeline.fit_transform(dataset)) +# print(dataset_4[0].get_scalar('p1')) + + +test_split = prob_def.get_split(params['test_split_name']) +dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split) diff --git a/src/plaid/bridges/huggingface_bridge.py b/src/plaid/bridges/huggingface_bridge.py index f3a5af4..26e1ff0 100644 --- a/src/plaid/bridges/huggingface_bridge.py +++ b/src/plaid/bridges/huggingface_bridge.py @@ -1,9 +1,15 @@ """Huggingface bridge for PLAID datasets.""" +import os import pickle +import shutil import sys +from multiprocessing import Pool from typing import Callable +from datasets import load_from_disk +from tqdm import tqdm + if sys.version_info >= (3, 11): from typing import Self else: # pragma: no cover @@ -83,7 +89,7 @@ def plaid_dataset_to_huggingface( """ def generator(): - for id in range(len(dataset)): + for id in tqdm(range(len(dataset))): yield { "sample": pickle.dumps(dataset[id].model_dump()), } @@ -97,7 +103,7 @@ def plaid_generator_to_huggingface( generator: Callable, infos: dict, problem_definition: ProblemDefinition, - processes_number: int = 1, + processes_number: int = os.cpu_count(), ) -> datasets.Dataset: """Use this function for creating a huggingface dataset from a sample generator function. @@ -138,7 +144,14 @@ def plaid_generator_to_huggingface( def huggingface_description_to_problem_definition( description: dict, ) -> ProblemDefinition: - """Docstring to complete.""" + """Converts a huggingface dataset description to a plaid problem definition. 
+ + Args: + description (dict): the description field of a huggingface dataset, containing the problem definition + + Returns: + problem_definition (ProblemDefinition): the plaid problem definition initialized from the huggingface dataset description + """ problem_definition = ProblemDefinition() problem_definition.set_task(description["task"]) problem_definition.set_split(description["split"]) @@ -154,8 +167,34 @@ def huggingface_description_to_problem_definition( return problem_definition +class HFToPlaidSampleConverter: + """Class to convert a huggingface dataset sample to a plaid sample.""" + + def __init__(self, ds): + self.ds = ds + + def __call__(self, ind): + """Convert a single sample from the huggingface dataset to a plaid sample.""" + return Sample.model_validate(pickle.loads(self.ds[ind]["sample"])) + + +class HFShardToPlaidSampleConverter: + """Class to convert a huggingface dataset sample shard to a plaid sample.""" + + def __init__(self, shard_path): + self.ds = load_from_disk(shard_path) + + def __call__(self, idx): + """Convert a sample shard from the huggingface dataset to a plaid sample.""" + sample = self.ds[idx] + return Sample.model_validate(pickle.loads(sample["sample"])) + + def huggingface_dataset_to_plaid( ds: datasets.Dataset, + ids: list[int] = None, + processes_number: int = os.cpu_count(), + large_dataset=False, ) -> tuple[Self, ProblemDefinition]: """Use this function for converting a plaid dataset from a huggingface dataset. @@ -165,6 +204,9 @@ def huggingface_dataset_to_plaid( Args: ds (datasets.Dataset): the dataset in huggingface format to be converted + ids (list, optional): The specific sample IDs to load from the dataset. Defaults to None. + processes_number (int, optional): The number of processes used to generate the plaid dataset + large_dataset (bool, optional): if True, uses a variant where parallel worker do not each load the complete dataset Returns: dataset (Dataset): the converted dataset. 
@@ -180,8 +222,50 @@ def huggingface_dataset_to_plaid( plaid_dataset, plaid_problem = huggingface_dataset_to_plaid(dataset) """ dataset = Dataset() - for i in range(len(ds)): - dataset.add_sample(Sample.model_validate(pickle.loads(ds[i]["sample"]))) + + print("Converting huggingface dataset to plaid dataset...") + + if large_dataset: + if ids: + NotImplementedError( + "ids selection not implemented with large_dataset option" + ) + for i in range(processes_number): + shard = ds.shard(num_shards=processes_number, index=i) + shard.save_to_disk(f"shards/dataset_shard_{i}") + + def parallel_convert(shard_path, n_workers): + converter = HFShardToPlaidSampleConverter(shard_path) + with Pool(processes=n_workers) as pool: + return list( + tqdm( + pool.imap(converter, range(len(converter.ds))), + total=len(converter.ds), + ) + ) + + samples = [] + + for i in range(processes_number): + shard_path = os.path.join("shards", f"dataset_shard_{i}") + shard_samples = parallel_convert(shard_path, n_workers=processes_number) + samples.extend(shard_samples) + + dataset.add_samples(samples) + + if os.path.exists("shards"): + shutil.rmtree("shards") + + else: + if ids: + indices = ids + else: + indices = range(len(ds)) + with Pool(processes=processes_number) as pool: + for sample in tqdm( + pool.imap(HFToPlaidSampleConverter(ds), indices), total=len(indices) + ): + dataset.add_sample(sample) infos = {} if "legal" in ds.description: From 8b32d7427e43763fbb8c3f07276e1ec36304bef9 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Sun, 29 Jun 2025 15:51:50 +0200 Subject: [PATCH 05/19] wip --- examples/pipeline/config_2.yml | 60 ++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 examples/pipeline/config_2.yml diff --git a/examples/pipeline/config_2.yml b/examples/pipeline/config_2.yml new file mode 100644 index 0000000..b6c60a5 --- /dev/null +++ b/examples/pipeline/config_2.yml @@ -0,0 +1,60 @@ +dataset_path: "PLAID-datasets/VKI-LS59" +save_path: "./artifacts" +train_split_name: "train" +test_split_name: "test" + +input_scalar_scaler: + type: "MinMaxScaler" + scalar_names: + - angle_in + - mach_out + +output_scalar_scaler: + type: "MinMaxScaler" + scalar_names: + - Q + - power + - Pr + - Tr + - eth_is + - angle_out + +pca_shape_embedding: + type: "PCA" + base_name: Base_2_2 + n_components: + nodes: 3 + +pca_field_embedding: + type: "PCA" + base_name: Base_2_2 + n_components: + mach: 5 + nut: 8 + +tabular_regressor: + type: "GaussianProcessRegressor" + options: + kernel: Matern + kernel_options: + nu: 2.5 + optim: fmin_l_bfgs_b + num_restarts: 10 + anisotropic: True + random_state: 42 + show_warnings: False + input: + type: scalar + names: + - angle_in + - mach_out + - reduced_nodes_0 + - reduced_nodes_1 + - reduced_nodes_2 + output: + type: scalar + names: + - reduced_mach_0 + - reduced_mach_1 + - reduced_mach_2 + - reduced_mach_4 \ No newline at end of file From 64db62184a8f653783bfa7bc9b3b7e9dc862cfa1 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Sun, 29 Jun 2025 22:52:08 +0200 Subject: [PATCH 06/19] wip: a cleaned version of scikit-learn pipelines is provided in examples/pipelines --- .gitignore | 1 + examples/pipelines/config.yml | 57 +++ .../experiments/config_0.yml} | 0 .../pipelines/experiments/persistent_node.py | 448 ++++++++++++++++++ .../experiments}/pipefunc_ex.py | 0 .../experiments}/pipefunc_tensile2d.py | 0 .../experiments/pipefunc_tensile2d_2.py | 104 ++++ .../pipelines/experiments/regressor_node.py | 28 ++ examples/pipelines/experiments/scaler_node.py | 33 
++ .../experiments/sklearn_hf_pipeline.py | 90 ++++ examples/pipelines/ml_pipeline_nodes.py | 249 ++++++++++ examples/pipelines/sklearn_pipeline.py | 103 ++++ .../with_persistence}/config_2.yml | 10 +- .../with_persistence}/ml_pipeline_nodes.py | 87 +++- .../with_persistence}/sklearn_pipeline.py | 56 +-- src/plaid/containers/dataset.py | 41 +- 16 files changed, 1243 insertions(+), 64 deletions(-) create mode 100644 examples/pipelines/config.yml rename examples/{pipeline/config.yml => pipelines/experiments/config_0.yml} (100%) create mode 100644 examples/pipelines/experiments/persistent_node.py rename examples/{pipeline => pipelines/experiments}/pipefunc_ex.py (100%) rename examples/{pipeline => pipelines/experiments}/pipefunc_tensile2d.py (100%) create mode 100644 examples/pipelines/experiments/pipefunc_tensile2d_2.py create mode 100644 examples/pipelines/experiments/regressor_node.py create mode 100644 examples/pipelines/experiments/scaler_node.py create mode 100644 examples/pipelines/experiments/sklearn_hf_pipeline.py create mode 100644 examples/pipelines/ml_pipeline_nodes.py create mode 100644 examples/pipelines/sklearn_pipeline.py rename examples/{pipeline => pipelines/with_persistence}/config_2.yml (85%) rename examples/{pipeline => pipelines/with_persistence}/ml_pipeline_nodes.py (80%) rename examples/{pipeline => pipelines/with_persistence}/sklearn_pipeline.py (56%) diff --git a/.gitignore b/.gitignore index d77e9c7..cdc857c 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ docs/source/notebooks/*.yaml # Heavy data *.cgns !notebooks/ex_rotor37_pv.cgns +*artifact* ############################################################################### # .gitignore PATTERN FORMAT: diff --git a/examples/pipelines/config.yml b/examples/pipelines/config.yml new file mode 100644 index 0000000..90adff1 --- /dev/null +++ b/examples/pipelines/config.yml @@ -0,0 +1,57 @@ +global: + dataset_path: "PLAID-datasets/VKI-LS59" + save_path: "./artifacts" + train_split_name: "train" + test_split_name: "test" + +input_scalar_scaler: + type: "MinMaxScaler" + scalar_names: + - angle_in + - mach_out + +output_scalar_scaler: + type: "MinMaxScaler" + scalar_names: + - Q + - power + - Pr + - Tr + - eth_is + - angle_out + +pca_nodes: + base_name: Base_2_2 + field_name: nodes + n_components: 3 + +pca_mach: + base_name: Base_2_2 + field_name: mach + n_components: 5 + +pca_nut: + base_name: Base_2_2 + field_name: nut + n_components: 8 + +regressor_mach: + type: "GaussianProcessRegressor" + options: + kernel: Matern + kernel_options: + nu: 2.5 + optim: fmin_l_bfgs_b + num_restarts: 2 + anisotropic: True + random_state: 42 + show_warnings: False + input: + scalar_names: + - angle_in + - mach_out + vector_names: + - reduced_nodes + output: + vector_names: + - reduced_mach \ No newline at end of file diff --git a/examples/pipeline/config.yml b/examples/pipelines/experiments/config_0.yml similarity index 100% rename from examples/pipeline/config.yml rename to examples/pipelines/experiments/config_0.yml diff --git a/examples/pipelines/experiments/persistent_node.py b/examples/pipelines/experiments/persistent_node.py new file mode 100644 index 0000000..a4e9651 --- /dev/null +++ b/examples/pipelines/experiments/persistent_node.py @@ -0,0 +1,448 @@ +from sklearn.base import BaseEstimator, RegressorMixin +import joblib +import json +from pathlib import Path +import hashlib +import numpy as np + + +class PersistentNode(BaseEstimator): + def __init__(self, save_path="models/unnamed.joblib"): + self.save_path = save_path + 
self._path = Path(save_path) + self._meta_path = self._path.with_suffix('.meta.json') + self.fitted_ = False + self.model = None + + def _save_file(self, obj, input_hash=None): + self._path.parent.mkdir(parents=True, exist_ok=True) + joblib.dump(obj, self._path) + if input_hash is not None: + self._meta_path.write_text(json.dumps({'hash': input_hash})) + self.fitted_ = True + + def save_model_with_hash(self, obj, data): + input_hash = self.hash_input(data) + self._save_file(obj, input_hash=input_hash) + + def load(self): + obj = joblib.load(self._path) + self.fitted_ = True + return obj + + def exists(self): + return self._path.exists() and self._meta_path.exists() + + def get_stored_hash(self): + if self._meta_path.exists(): + return json.loads(self._meta_path.read_text()).get('hash') + return None + + def hash_input(self, data): + m = hashlib.sha256() + if isinstance(data, tuple): + for arr in data: + m.update(np.ascontiguousarray(arr).data) + else: + m.update(np.ascontiguousarray(data).data) + return m.hexdigest() + + def check_fitted_or_load(self): + if self.fitted_: + return + if self.exists(): + loaded = self.load() + self.set_model(loaded) + else: + raise ValueError("Model not fitted or not saved.") + + def set_model(self, obj): + self.model = obj + + def get_model(self): + return self.model + + def load_if_cached(self, data): + input_hash = self.hash_input(data) + if self.exists() and self.get_stored_hash() == input_hash: + self.set_model(self.load()) + return True + return False + + def get_params(self, deep=True): + params = super().get_params(deep=deep) + if deep and hasattr(self.model, "get_params"): + for k, v in self.model.get_params(deep=True).items(): + params[f"model__{k}"] = v + return params + + def set_params(self, **params): + model_params = {k[7:]: v for k, v in params.items() if k.startswith("model__")} + node_params = {k: v for k, v in params.items() if not k.startswith("model__")} + if node_params: + super().set_params(**node_params) + if model_params and self.model is not None: + self.model.set_params(**model_params) + return self + + + +from sklearn.base import BaseEstimator, RegressorMixin +from sklearn.model_selection import GridSearchCV +from sklearn.datasets import make_regression +from sklearn.metrics import mean_squared_error +from pipefunc import pipefunc, Pipeline +import numpy as np +import inspect +from typing import Dict, Any, Optional + +class PipefuncSklearnWrapper(BaseEstimator, RegressorMixin): + """ + Scikit-learn compatible wrapper for pipefunc Pipeline objects. + + This wrapper allows pipefunc pipelines to be used with scikit-learn tools + like GridSearchCV, cross_val_score, etc. + """ + + def __init__(self, pipeline: Pipeline, prediction_output: str = "predictions", **kwargs): + """ + Initialize the wrapper. 
+ + Parameters: + ----------- + pipeline : pipefunc.Pipeline + The pipefunc pipeline to wrap + prediction_output : str + Name of the pipeline output that contains predictions + **kwargs : dict + Additional parameters that can be tuned via GridSearchCV + """ + self.pipeline = pipeline + self.prediction_output = prediction_output + + # Store additional parameters + for key, value in kwargs.items(): + setattr(self, key, value) + + # Keep track of parameter names for get_params/set_params + self._param_names = set(kwargs.keys()) + self._param_names.add('pipeline') + self._param_names.add('prediction_output') + + # Training data storage + self.X_train_ = None + self.y_train_ = None + self.is_fitted_ = False + + def _rebuild_pipeline_with_params(self) -> Pipeline: + """ + Rebuild the pipeline with current parameters. + + This method attempts to inject current parameters into pipeline functions + that accept them as arguments. + """ + # For now, return the original pipeline + # In a more sophisticated implementation, you would: + # 1. Extract functions from the original pipeline + # 2. Create new functions with updated parameters + # 3. Rebuild the pipeline + return self.pipeline + + def _validate_pipeline_output(self, result): + """Validate that the pipeline returns the expected output.""" + if isinstance(result, dict): + if self.prediction_output not in result: + available_outputs = list(result.keys()) + raise ValueError( + f"Pipeline output '{self.prediction_output}' not found. " + f"Available outputs: {available_outputs}" + ) + return result[self.prediction_output] + else: + # Assume the result is the direct prediction + return result + + def fit(self, X, y): + """ + Fit the pipeline. + + Parameters: + ----------- + X : array-like of shape (n_samples, n_features) + Training data + y : array-like of shape (n_samples,) + Target values + + Returns: + -------- + self : object + Returns self for method chaining + """ + # Store training data + self.X_train_ = X.copy() if hasattr(X, 'copy') else np.array(X) + self.y_train_ = y.copy() if hasattr(y, 'copy') else np.array(y) + + # Rebuild pipeline with current parameters + self.pipeline_ = self._rebuild_pipeline_with_params() + + # Mark as fitted + self.is_fitted_ = True + + return self + + def predict(self, X): + """ + Make predictions using the fitted pipeline. + + Parameters: + ----------- + X : array-like of shape (n_samples, n_features) + Input data to predict on + + Returns: + -------- + predictions : array-like of shape (n_samples,) + Predicted values + """ + if not self.is_fitted_: + raise ValueError("This PipefuncSklearnWrapper instance is not fitted yet.") + + try: + # Execute the pipeline with training data and test data + # This assumes the pipeline expects both training and test data + result = self.pipeline_( + X=self.X_train_, + y=self.y_train_, + X_test=X + ) + + # Extract predictions from result + predictions = self._validate_pipeline_output(result) + + return predictions + + except Exception as e: + # Try alternative calling patterns + try: + # Maybe the pipeline expects different argument names + result = self.pipeline_( + X_train=self.X_train_, + y_train=self.y_train_, + X_test=X + ) + return self._validate_pipeline_output(result) + except: + # Try calling with specific output name + try: + result = self.pipeline_( + self.prediction_output, + X=self.X_train_, + y=self.y_train_, + X_test=X + ) + return result + except: + raise RuntimeError( + f"Failed to execute pipeline for prediction. 
" + f"Original error: {str(e)}" + ) + + def score(self, X, y): + """ + Return the coefficient of determination R^2 of the prediction. + + Parameters: + ----------- + X : array-like of shape (n_samples, n_features) + Test samples + y : array-like of shape (n_samples,) + True values for X + + Returns: + -------- + score : float + R^2 score + """ + from sklearn.metrics import r2_score + + predictions = self.predict(X) + return r2_score(y, predictions) + + def get_params(self, deep=True): + """ + Get parameters for this estimator. + + Parameters: + ----------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns: + -------- + params : dict + Parameter names mapped to their values + """ + params = {} + + # Get all stored parameters + for param_name in self._param_names: + if hasattr(self, param_name): + params[param_name] = getattr(self, param_name) + + # If deep=True, try to extract parameters from pipeline functions + if deep and hasattr(self, 'pipeline') and self.pipeline is not None: + try: + # Try to access pipefunc internal structure + if hasattr(self.pipeline, 'graph'): + # NetworkX graph-based approach + for node_name in self.pipeline.graph.nodes(): + node_data = self.pipeline.graph.nodes[node_name] + if 'func' in node_data: + func = node_data['func'] + # Extract function parameters + self._extract_function_params(func, node_name, params) + except Exception: + # If we can't extract deep parameters, continue silently + pass + + return params + + def _extract_function_params(self, func, node_name, params): + """Extract parameters from a pipeline function.""" + try: + # Get the original function if it's wrapped + original_func = func + if hasattr(func, '__wrapped__'): + original_func = func.__wrapped__ + + # Get function signature + sig = inspect.signature(original_func) + + # Extract parameters with defaults + for param_name, param in sig.parameters.items(): + if param.default is not inspect.Parameter.empty: + prefixed_name = f"{node_name}__{param_name}" + params[prefixed_name] = param.default + except Exception: + # If parameter extraction fails, continue silently + pass + + def set_params(self, **params): + """ + Set the parameters of this estimator. + + Parameters: + ----------- + **params : dict + Estimator parameters + + Returns: + -------- + self : object + Estimator instance + """ + valid_params = self.get_params(deep=True) + + for key, value in params.items(): + if key in valid_params: + setattr(self, key, value) + # Add to tracked parameters if not already there + if key not in self._param_names: + self._param_names.add(key) + else: + raise ValueError( + f"Invalid parameter {key} for estimator {self.__class__.__name__}. 
" + f"Valid parameters are: {sorted(valid_params.keys())}" + ) + + # Reset fitted state if parameters changed + self.is_fitted_ = False + + return self + + + + +# class PipefuncWrapper(BaseEstimator, RegressorMixin): +# def __init__(self, pipe=None): +# self.pipe = pipe + +# def fit(self, X, y): +# self.pipe.clear() # make sure it's fresh +# self.pipe.map({ +# "X": X, +# "y": y +# }) +# return self + +# def predict(self, X): +# # Replace inputs, remove cached results downstream +# self.pipe.clear() +# self.pipe.map({ +# "X": X +# }) +# return self.pipe["y_pred"] + +# def get_params(self, deep=True): +# """Get parameters for this estimator""" +# params = {} + +# # Get all parameters stored as instance attributes +# for param_name in self._param_names: +# if hasattr(self, param_name): +# params[param_name] = getattr(self, param_name) + +# # If deep=True and we have a pipeline, try to get function parameters +# if deep and self.pipe is not None: +# try: +# # pipefunc stores functions in a different way +# # Access the internal graph structure +# if hasattr(self.pipe, 'graph'): +# for node_name in self.pipe.graph.nodes(): +# node_data = self.pipe.graph.nodes[node_name] +# if 'func' in node_data: +# func = node_data['func'] +# # Get function signature parameters +# if hasattr(func, '__wrapped__'): # For decorated functions +# func = func.__wrapped__ +# sig = inspect.signature(func) +# for param_name, param in sig.parameters.items(): +# if param.default is not inspect.Parameter.empty: +# params[f"{node_name}__{param_name}"] = param.default +# elif hasattr(self.pipe, '_functions'): +# # Alternative access pattern +# for func in self.pipe._functions: +# func_name = getattr(func, 'output_name', func.__name__) +# if hasattr(func, '__wrapped__'): +# original_func = func.__wrapped__ +# else: +# original_func = func +# sig = inspect.signature(original_func) +# for param_name, param in sig.parameters.items(): +# if param.default is not inspect.Parameter.empty: +# params[f"{func_name}__{param_name}"] = param.default +# except Exception as e: +# # If we can't extract deep parameters, just continue +# # This ensures compatibility even if pipefunc internals change +# pass + +# return params + +# def set_params(self, **params): +# """Set parameters for this estimator""" +# valid_params = self.get_params(deep=True) + +# for key, value in params.items(): +# if key in valid_params: +# if "__" in key: +# # This is a nested parameter (function parameter) +# # Store it for use when rebuilding the pipeline +# setattr(self, key, value) +# else: +# # This is a top-level parameter +# setattr(self, key, value) +# if key not in self._param_names: +# self._param_names.add(key) +# else: +# raise ValueError(f"Invalid parameter {key}") \ No newline at end of file diff --git a/examples/pipeline/pipefunc_ex.py b/examples/pipelines/experiments/pipefunc_ex.py similarity index 100% rename from examples/pipeline/pipefunc_ex.py rename to examples/pipelines/experiments/pipefunc_ex.py diff --git a/examples/pipeline/pipefunc_tensile2d.py b/examples/pipelines/experiments/pipefunc_tensile2d.py similarity index 100% rename from examples/pipeline/pipefunc_tensile2d.py rename to examples/pipelines/experiments/pipefunc_tensile2d.py diff --git a/examples/pipelines/experiments/pipefunc_tensile2d_2.py b/examples/pipelines/experiments/pipefunc_tensile2d_2.py new file mode 100644 index 0000000..e143006 --- /dev/null +++ b/examples/pipelines/experiments/pipefunc_tensile2d_2.py @@ -0,0 +1,104 @@ +from datasets import load_from_disk, load_dataset 
+from plaid.containers.sample import Sample +from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition +import os, pickle +import numpy as np +from safetensors.numpy import save_file +from sklearn.base import BaseEstimator, RegressorMixin + +from pipefunc import Pipeline, pipefunc +from sklearn.preprocessing import StandardScaler + +from pathlib import Path +import joblib + +import yaml +import time + +from ml_pipeline_nodes import ScalarScalerNode + + +@pipefunc(output_name=("dataset", "prob_def")) +def load_hf_from_hub(path): + start = time.time() + hf_dataset = load_dataset(path, split="all_samples") + print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds") + dataset = huggingface_dataset_to_plaid(hf_dataset) + return dataset + + +@pipefunc(output_name=("scaled_dataset")) +def scale_scalars(dataset, prob_def, train_split_name, test_split_name): + + scalar_scaler = ScalarScalerNode() + + ids_train = prob_def.get_split(train_split_name) + input_scalars_train = dataset.get_scalars_to_tabular( + scalar_names = prob_def.get_input_scalars_names(), + sample_ids = ids_train, + as_nparray = True + ) + output_scalars_train = dataset.get_scalars_to_tabular( + scalar_names = prob_def.get_output_scalars_names(), + sample_ids = ids_train, + as_nparray = True + ) + + ids_test = prob_def.get_split(test_split_name) + input_scalars_test = dataset.get_scalars_to_tabular( + scalar_names = prob_def.get_input_scalars_names(), + sample_ids = ids_test, + as_nparray = True + ) + + input_scalar_scaler = StandardScaler() + input_scalars_train = input_scalar_scaler.fit_transform(input_scalars_train) + input_scalars_test = input_scalar_scaler.transform(input_scalars_test) + + output_scalar_scaler = StandardScaler() + output_scalars_train = output_scalar_scaler.fit_transform(output_scalars_train) + + return dataset + + +def extract_leaf_keys(d): + leaves = {} + if isinstance(d, dict): + for k, v in d.items(): + if isinstance(v, dict) or isinstance(v, list): + leaves.update(extract_leaf_keys(v)) + else: + leaves[k] = v + elif isinstance(d, list): + for item in d: + leaves.update(extract_leaf_keys(item)) + return leaves + + +if __name__ == "__main__": + + start = time.time() + + pipeline = Pipeline( + [ + load_hf_from_hub, + scale_scalars, + ], + name="ML_Workflow", + profile=True + ) + + # pipeline.visualize() + + with open("config_2.yml") as f: + parameters = yaml.safe_load(f) + + scalar_data = pipeline(**parameters) + + print(f"Pipeline execution time {time.time() - start:.2g} seconds") + + + pipeline.print_profiling_stats() + # print("Dataset:", type(dataset[0:10]), type(dataset)) + + # print(scalar_data) diff --git a/examples/pipelines/experiments/regressor_node.py b/examples/pipelines/experiments/regressor_node.py new file mode 100644 index 0000000..7afcbc7 --- /dev/null +++ b/examples/pipelines/experiments/regressor_node.py @@ -0,0 +1,28 @@ +from sklearn.linear_model import Ridge +from sklearn.base import RegressorMixin +from persistent_node import PersistentNode + + +class RegressorNode(PersistentNode, RegressorMixin): + def __init__(self, save_path="models/regressor.joblib"): + super().__init__(save_path) + self.model = Ridge() + + def __call__(self, X, y=None): + if y is not None: + self.fit(X, y) + return self.predict(X) + + def fit(self, X, y): + if self.load_if_cached((X, y)): + return self + self.model.fit(X, y) + self.save_model_with_hash(self.model, (X, y)) + return self + + def predict(self, X): + 
self.check_fitted_or_load() + return self.model.predict(X) + + def inverse_transform(self, y_pred): + return y_pred diff --git a/examples/pipelines/experiments/scaler_node.py b/examples/pipelines/experiments/scaler_node.py new file mode 100644 index 0000000..42d37ef --- /dev/null +++ b/examples/pipelines/experiments/scaler_node.py @@ -0,0 +1,33 @@ +from sklearn.preprocessing import StandardScaler +from persistent_node import PersistentNode + + +class ScalerNode(PersistentNode): + def __init__(self, save_path="models/scaler.joblib"): + super().__init__(save_path) + self.model = StandardScaler() + + def __call__(self, X): + return self.fit_transform(X) + + def fit(self, X, y=None): + if self.load_if_cached(X): + return self + self.model.fit(X) + self.save_model_with_hash(self.model, X) + return self + + def transform(self, X): + self.check_fitted_or_load() + return self.model.transform(X) + + def fit_transform(self, X, y=None): + if self.load_if_cached(X): + return self.model.transform(X) + self.model.fit(X) + self.save_model_with_hash(self.model, X) + return self.model.transform(X) + + def inverse_transform(self, X_scaled): + self.check_fitted_or_load() + return self.model.inverse_transform(X_scaled) \ No newline at end of file diff --git a/examples/pipelines/experiments/sklearn_hf_pipeline.py b/examples/pipelines/experiments/sklearn_hf_pipeline.py new file mode 100644 index 0000000..9270528 --- /dev/null +++ b/examples/pipelines/experiments/sklearn_hf_pipeline.py @@ -0,0 +1,90 @@ +from datasets import load_dataset +from plaid.containers.sample import Sample +from plaid.containers.dataset import Dataset +from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition +import os, pickle +import numpy as np +from safetensors.numpy import save_file +from sklearn.base import BaseEstimator, RegressorMixin + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from pathlib import Path +import joblib + +import yaml +import time + +from ml_pipeline_HF_nodes import ScalarScalerNode, GPRegressorNode + + +with open("config_2.yml") as f: + params = yaml.safe_load(f) + + +class HFDataset(Dataset): + def __init__(self, hf_dataset = None, path = None): + assert not (hf_dataset and path), "hf_dataset and path cannot be both initialized" + assert hf_dataset or path, "hf_dataset and path cannot be both not initialized" + if hf_dataset: + self.ds = hf_dataset + elif path: + self.ds = load_dataset(path, split="all_samples") + + def __getitem__(self, idx): + if isinstance(idx, slice): + return HFDataset(self.ds[idx]) + return Sample.model_validate(pickle.loads(self.ds[idx]["sample"])) + + def __len__(self): + return len(self.ds) + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + def __getattr__(self, name): + attr = getattr(self.ds, name) + if callable(attr): + def wrapper(*args, **kwargs): + result = attr(*args, **kwargs) + if isinstance(result, type(self.ds)): + return HFDataset(result) + return result + return wrapper + return attr + + +start = time.time() +dataset = HFDataset(path = params['dataset_path']) +print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds") + +params['prob_def'] = huggingface_description_to_problem_definition(dataset.description) + + +pipeline = Pipeline([ + ('scalar_scaler', ScalarScalerNode(name = 'scalar_scaler', params = params)), + # ('tabular_regressor', GPRegressorNode(name = 'tabular_regressor', params = params)) +]) + 
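+# The two prints around fit_transform below read the same output scalar before
+# and after fitting, to check whether the scaler node updates the samples in
+# place or only returns transformed copies.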
+print(dataset[0].get_scalar('max_U2_top')) + + +pipeline.fit_transform(dataset) +print("pipeline fitted") + +print(dataset[0].get_scalar('max_U2_top')) + +# dataset_2 = pipeline.inverse_transform(pipeline.predict(dataset)) + +# print(dataset_2[0].get_scalar('max_U2_top')) + +# dataset_3 = pipeline.inverse_transform(dataset_2) + +# print(dataset_3[0].get_scalar('p1')) + + + +# dataset_4 = pipeline.inverse_transform(pipeline.fit_transform(dataset)) +# print(dataset_4[0].get_scalar('p1')) diff --git a/examples/pipelines/ml_pipeline_nodes.py b/examples/pipelines/ml_pipeline_nodes.py new file mode 100644 index 0000000..6f0aa37 --- /dev/null +++ b/examples/pipelines/ml_pipeline_nodes.py @@ -0,0 +1,249 @@ +import os +import json +import joblib +from joblib import Parallel, delayed +import numpy as np +from pathlib import Path +from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin +from sklearn.linear_model import Ridge +from sklearn.decomposition import PCA +from plaid.containers.dataset import Dataset +import copy + +from sklearn.preprocessing import StandardScaler, MinMaxScaler + +available_scalers = { + "StandardScaler": StandardScaler, + "MinMaxScaler": MinMaxScaler, +} + +class ScalarScalerNode(BaseEstimator, TransformerMixin): + + def __init__(self, type, scalar_names): + self.type = type + self.scalar_names = scalar_names + + assert type in available_scalers.keys(), "Scaler "+type+" not available" + + self.model = None + + + def get_scalars(self, dataset): + if isinstance(dataset, list): + dataset = Dataset.from_list_of_samples(dataset) + return dataset.get_scalars_to_tabular( + scalar_names = self.scalar_names, + as_nparray = True + ) + + def set_scalars(self, dataset, scalars): + for i in range(len(dataset)): + for j, sn in enumerate(self.scalar_names): + dataset[i].add_scalar(sn, scalars[i, j]) + + def fit(self, dataset, y=None): + self.model = available_scalers[self.type]() + + scalars = self.get_scalars(dataset) + self.model.fit(scalars) + self.fitted_ = True + return self + + def transform(self, dataset): + scalars = self.get_scalars(dataset) + scaled_scalars = self.model.transform(scalars) + dataset_ = copy.deepcopy(dataset) + self.set_scalars(dataset_, scaled_scalars) + return dataset_ + + def inverse_transform(self, dataset): + scaled_scalars = self.get_scalars(dataset) + scalars = self.model.inverse_transform(scaled_scalars) + dataset_ = copy.deepcopy(dataset) + self.set_scalars(dataset_, scalars) + return dataset_ + + +class PCAEmbeddingNode(BaseEstimator, RegressorMixin, TransformerMixin): + + def __init__(self, field_name = None, n_components = None, zone_name = None, base_name = None, time = None, location = "Vertex"): + + self.zone_name = zone_name + self.base_name = base_name + self.time = time + self.location = location + + self.field_name = field_name + self.n_components = n_components + + self.model = None + + def get_all_fields(self, dataset): + all_fields = [] + for sample in dataset: + if self.field_name == "nodes": + field = sample.get_nodes(self.zone_name, self.base_name, self.time).flatten() + else: + field = sample.get_field(self.field_name, self.zone_name, self.base_name, self.location, self.time) + all_fields.append(field) + return np.array(all_fields) + + def set_reduced_fields(self, dataset, reduced_fields): + for i in range(len(dataset)): + for j in range(self.n_components): + dataset[i].add_scalar(f"reduced_{self.field_name}_{j}", reduced_fields[i, j]) + + def get_reduced_fields(self, dataset): + if isinstance(dataset, list): + dataset = 
Dataset.from_list_of_samples(dataset) + return dataset.get_scalars_to_tabular( + scalar_names = [f"reduced_{self.field_name}_{j}" for j in range(self.n_components)], + as_nparray = True + ) + + def set_fields(self, dataset, fields): + for i in range(len(dataset)): + dataset[i].add_field(self.field_name, fields[i], self.zone_name, self.base_name, self.location, self.time) + + def fit(self, dataset, y=None): + self.model = PCA(n_components = self.n_components) + + all_fields = self.get_all_fields(dataset) + self.model.fit(all_fields) + self.fitted_ = True + return self + + def transform(self, dataset): + all_fields = self.get_all_fields(dataset) + reduced_fields = self.model.transform(all_fields) + dataset_ = copy.deepcopy(dataset) + self.set_reduced_fields(dataset_, reduced_fields) + return dataset_ + + def inverse_transform(self, dataset): + reduced_fields = self.get_reduced_fields(dataset) + fields = self.model.inverse_transform(reduced_fields) + dataset_ = copy.deepcopy(dataset) + self.set_fields(dataset_, fields) + return dataset_ + + +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel +from sklearn.multioutput import MultiOutputRegressor + +available_kernel_classes = { + "Matern":Matern +} + +class GPRegressorNode(BaseEstimator, RegressorMixin, TransformerMixin): + + def __init__(self, params): + + self.params = params + assert params['type'] == "GaussianProcessRegressor" + assert self.params['options']["kernel"] in available_kernel_classes.keys(), "scikit-learn kernel "+self.params['options']["kernel"]+" not available" + + self.model = None + + + def get_scalars(self, dataset): + if isinstance(dataset, list): + dataset = Dataset.from_list_of_samples(dataset) + return dataset.get_scalars_to_tabular( + scalar_names = self.input_names, + as_nparray = True + ) + + def fit(self, dataset, y=None): + if isinstance(dataset, list): + dataset = Dataset.from_list_of_samples(dataset) + all_available_scalar = dataset.get_scalar_names() + + self.input_names = [] + if "scalar_names" in self.params['input']: + self.input_names += self.params['input']["scalar_names"] + if "vector_names" in self.params['input']: + for vn in self.params['input']["vector_names"]: + self.input_names += [s for s in all_available_scalar if s.startswith(vn)] + + self.output_names = [] + if "scalar_names" in self.params['output']: + self.output_names += self.params['output']["scalar_names"] + if "vector_names" in self.params['output']: + for vn in self.params['output']["vector_names"]: + self.output_names += [s for s in all_available_scalar if s.startswith(vn)] + + kernel_class = available_kernel_classes[self.params['options']["kernel"]] + if self.params['options']["anisotropic"]: + kernel = ConstantKernel() * kernel_class(length_scale=np.ones(len(self.input_names)), length_scale_bounds=(1e-8, 1e8), + **self.params['options']["kernel_options"]) + WhiteKernel(noise_level_bounds=(1e-8, 1e8)) + else: + kernel = kernel_class(length_scale_bounds=(1e-8, 1e8), **self.params['options']["kernel_options"]) \ + + WhiteKernel(noise_level_bounds=(1e-8, 1e8)) + + gpr = GaussianProcessRegressor( + kernel=kernel, + optimizer=self.params['options']["optim"], + n_restarts_optimizer=self.params['options']["num_restarts"], + random_state = self.params['options']["random_state"]) + + self.model = MultiOutputRegressor(gpr) + if isinstance(dataset, list): + dataset = Dataset.from_list_of_samples(dataset) + X = dataset.get_scalars_to_tabular( + scalar_names 
= self.input_names, + as_nparray = True + ) + y = dataset.get_scalars_to_tabular( + scalar_names = self.output_names, + as_nparray = True + ) + self.model.fit(X, y) + + self.fitted_ = True + return self + + def predict(self, dataset): + if isinstance(dataset, list): + dataset = Dataset.from_list_of_samples(dataset) + X = dataset.get_scalars_to_tabular( + scalar_names = self.input_names, + as_nparray = True + ) + + pred= self.model.predict(X) + if len(self.output_names) == 1: + pred = pred.reshape((-1, 1)) + + dataset_ = copy.deepcopy(dataset) + for i in range(len(dataset)): + for j, sn in enumerate(self.output_names): + dataset_[i].add_scalar(sn, pred[i, j]) + + return dataset_ + + def transform(self, dataset): + return dataset + + def inverse_transform(self, dataset): + return dataset + + def score(self, dataset, dataset_ref): + if not dataset_ref: + # case where GirdSearchCV is called with only one argument search.fit(dataset) + dataset_ref = dataset + if isinstance(dataset, list): + dataset = Dataset.from_list_of_samples(dataset) + X = dataset.get_scalars_to_tabular( + scalar_names = self.input_names, + as_nparray = True + ) + if isinstance(dataset_ref, list): + dataset_ref = Dataset.from_list_of_samples(dataset_ref) + y = dataset_ref.get_scalars_to_tabular( + scalar_names = self.output_names, + as_nparray = True + ) + return self.model.score(X, y) + diff --git a/examples/pipelines/sklearn_pipeline.py b/examples/pipelines/sklearn_pipeline.py new file mode 100644 index 0000000..8fbcb04 --- /dev/null +++ b/examples/pipelines/sklearn_pipeline.py @@ -0,0 +1,103 @@ +from datasets import load_dataset +from plaid.containers.sample import Sample +from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition +import os, pickle +import numpy as np +from safetensors.numpy import save_file +from sklearn.base import BaseEstimator, RegressorMixin +from sklearn.model_selection import GridSearchCV + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from pathlib import Path +import joblib +from copy import copy + +import yaml +import time + +from ml_pipeline_nodes import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode #, TutteMorphing + +import warnings +warnings.filterwarnings('ignore', module='sklearn') + + +with open("config.yml") as f: + config = yaml.safe_load(f) + +global_params = config["global"] + + +start = time.time() +hf_dataset = load_dataset(global_params['dataset_path'], split="all_samples") +print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds") + +prob_def = huggingface_description_to_problem_definition(hf_dataset.description) + +train_split = prob_def.get_split(global_params['train_split_name'])[:100] +dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split) + + +pipeline = Pipeline([ + ('input_scalar_scaler', ScalarScalerNode(type = config['input_scalar_scaler']['type'], scalar_names = config['input_scalar_scaler']['scalar_names'])), + ('output_scalar_scaler', ScalarScalerNode(type = config['output_scalar_scaler']['type'], scalar_names = config['output_scalar_scaler']['scalar_names'])), + ('pca_nodes', PCAEmbeddingNode(field_name = config['pca_nodes']['field_name'], n_components = config['pca_nodes']['n_components'], base_name = config['pca_nodes']['base_name'])), + ('pca_mach', PCAEmbeddingNode(field_name = config['pca_mach']['field_name'], n_components = config['pca_mach']['n_components'], base_name = 
config['pca_mach']['base_name'])), + ('regressor_mach', GPRegressorNode(params = config['regressor_mach'])) +]) + + + +print("=================================") +print("GridSearchCV example:") +# print("pipeline parameters=", pipeline.get_params(deep=True)) +# print("pipeline parameters=", pipeline.get_params().keys()) +# print("Pipeline steps:", pipeline.steps) + + +param_grid = { + 'pca_nodes__n_components': [2, 3], + 'pca_mach__n_components': [4, 5], +} + +# Run GridSearchCV +search = GridSearchCV(pipeline, param_grid=param_grid, cv=3, verbose=3) +search.fit(dataset_train) + +# Results +print("Best parameters:", search.best_params_) +print("Best score:", search.best_score_) + + + +print("=================================") +print("Direct pipeline example:") + +train_split = prob_def.get_split(global_params['train_split_name']) +test_split = prob_def.get_split(global_params['train_split_name'])[:10] + +dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split) +dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split) + + + +pipeline.fit(dataset_train) +print("pipeline fitted") + +dataset_test_transformed = pipeline.transform(dataset_test) + +print("score =", pipeline.score(dataset_test, dataset_test_transformed)) + + +dataset_test_pred = pipeline.inverse_transform(pipeline.predict(dataset_test)) + +import shutil + +if os.path.exists(os.path.join(global_params['save_path'], "dataset_test")): + shutil.rmtree(os.path.join(global_params['save_path'], "dataset_test")) +dataset_test._save_to_dir_(os.path.join(global_params['save_path'], "dataset_test"), verbose = True) + +if os.path.exists(os.path.join(global_params['save_path'], "dataset_test_pred")): + shutil.rmtree(os.path.join(global_params['save_path'], "dataset_test_pred")) +dataset_test_pred._save_to_dir_(os.path.join(global_params['save_path'], "dataset_test_pred"), verbose = True) diff --git a/examples/pipeline/config_2.yml b/examples/pipelines/with_persistence/config_2.yml similarity index 85% rename from examples/pipeline/config_2.yml rename to examples/pipelines/with_persistence/config_2.yml index b6c60a5..10d870a 100644 --- a/examples/pipeline/config_2.yml +++ b/examples/pipelines/with_persistence/config_2.yml @@ -1,7 +1,8 @@ -dataset_path: "PLAID-datasets/VKI-LS59" -save_path: "./artifacts" -train_split_name: "train" -test_split_name: "test" +global: + dataset_path: "PLAID-datasets/VKI-LS59" + save_path: "./artifacts" + train_split_name: "train" + test_split_name: "test" input_scalar_scaler: type: "MinMaxScaler" @@ -57,4 +58,5 @@ tabular_regressor: - reduced_mach_0 - reduced_mach_1 - reduced_mach_2 + - reduced_mach_3 - reduced_mach_4 \ No newline at end of file diff --git a/examples/pipeline/ml_pipeline_nodes.py b/examples/pipelines/with_persistence/ml_pipeline_nodes.py similarity index 80% rename from examples/pipeline/ml_pipeline_nodes.py rename to examples/pipelines/with_persistence/ml_pipeline_nodes.py index 429e90f..c0575b6 100644 --- a/examples/pipeline/ml_pipeline_nodes.py +++ b/examples/pipelines/with_persistence/ml_pipeline_nodes.py @@ -9,12 +9,55 @@ from sklearn.decomposition import PCA +# def flatten_dict(d, prefix=''): +# items = {} +# for k, v in d.items(): +# new_key = f'{prefix}__{k}' if prefix else k +# if isinstance(v, dict): +# items.update(flatten_dict(v, new_key)) +# else: +# items[f'param__{new_key}'] = v +# return items + +# def unflatten_dict(flat): +# nested = {} +# for flat_key, value in flat.items(): +# keys = flat_key.split('__') +# d = nested +# for k in 
keys[:-1]: +# d = d.setdefault(k, {}) +# d[keys[-1]] = value +# return nested + + class PersistentNode(BaseEstimator, RegressorMixin, TransformerMixin): - def __init__(self, save_path): - self.save_path = Path(save_path) + def __init__(self, name, global_params, params): + self.name = name + self.global_params = global_params + self.params = params + self.save_path = Path(os.path.join(global_params['save_path'], f"{name}.joblib")) self.fitted_ = False self.model = None + # def get_params(self, deep=True): + # return flatten_dict(self.params[self.name]) + + # def set_params(self, **kwargs): + # if 'name' in kwargs: + # self.name = kwargs.pop('name') + # if 'params' in kwargs: + # self.params.update(kwargs.pop('params')) + + # # Extract param__ keys and merge into self.params + # flat = {} + # for k, v in kwargs.items(): + # if k.startswith('param__'): + # flat_key = k[len('param__'):] + # flat[flat_key] = v + # nested_update = unflatten_dict(flat) + # self.params = self._deep_merge(self.params, nested_update) + # return self + def save(self, obj): self.save_path.parent.mkdir(parents=True, exist_ok=True) joblib.dump(obj, self.save_path) @@ -91,10 +134,10 @@ def _score(self, X, y): class ScalarScalerNode(PersistentNode): - def __init__(self, name, params): - super().__init__(os.path.join(params['save_path'], f"{name}.joblib")) - self.scalar_names = params[name]['scalar_names'] - self.model = available_scalers[params[name]['type']]() + def __init__(self, name, global_params, params): + super().__init__(name, global_params, params) + self.scalar_names = params['scalar_names'] + self.model = available_scalers[params['type']]() def get_scalars(self, dataset): return dataset.get_scalars_to_tabular( @@ -128,19 +171,22 @@ def _inverse_transform(self, dataset): class PCAEmbeddingNode(PersistentNode): - def __init__(self, name:str, params:dict): - super().__init__(os.path.join(params['save_path'], f"{name}.joblib")) + def __init__(self, name, global_params, params): + super().__init__(name, global_params, params) - self.zone_name = params[name]["zone_name"] if "zone_name" in params[name] else None - self.base_name = params[name]["base_name"] if "base_name" in params[name] else None - self.time = params[name]["time"] if "time" in params[name] else None - self.location = params[name]["location"] if "location" in params[name] else "Vertex" + self.zone_name = params["zone_name"] if "zone_name" in params else None + self.base_name = params["base_name"] if "base_name" in params else None + self.time = params["time"] if "time" in params else None + self.location = params["location"] if "location" in params else "Vertex" - assert params[name]['type'] == "PCA" - self.n_components = params[name]['n_components'] + assert params['type'] == "PCA" + self.n_components = params['n_components'] self.field_names = list(self.n_components.keys()) self.model = {name: PCA(n_components = nc) for name, nc in self.n_components.items()} + def get_params(self, deep=True): + return {f"n_components__{key}":val for key, val in self.n_components.items()} + def get_all_fields(self, dataset, fn): all_fields = [] for sample in dataset: @@ -206,7 +252,7 @@ def _inverse_transform(self, dataset): # class TutteMorphing(PersistentNode): # def __init__(self, name, params): -# super().__init__(os.path.join(params['save_path'], f"{name}.joblib")) +# super().__init__(params['save_path'], name) # self.prob_def = params['prob_def'] # self.loc_params = params[name] # self.model = {} @@ -261,10 +307,10 @@ def _inverse_transform(self, dataset): 
class GPRegressorNode(PersistentNode): - def __init__(self, name, params): + def __init__(self, name, global_params, params): + super().__init__(name, global_params, params) - super().__init__(os.path.join(params['save_path'], f"{name}.joblib")) - self.loc_params = params[name] + self.loc_params = params assert self.loc_params['type'] == "GaussianProcessRegressor" options = self.loc_params['options'] @@ -343,7 +389,8 @@ def _score(self, dataset, dataset_ref): class ScalerNode(PersistentNode): def __init__(self, name, save_path): - super().__init__(os.path.join(save_path, f"{name}.joblib")) + super().__init__(save_path, name) + self.model = StandardScaler() def _fit(self, X, y=None): @@ -361,7 +408,7 @@ def _predict(self, X): class RegressorNode(PersistentNode): def __init__(self, name, save_path, alpha=1.0): - super().__init__(os.path.join(save_path, f"{name}.joblib")) + super().__init__(save_path, name) self.model = Ridge(alpha=alpha) def _fit(self, X, y): diff --git a/examples/pipeline/sklearn_pipeline.py b/examples/pipelines/with_persistence/sklearn_pipeline.py similarity index 56% rename from examples/pipeline/sklearn_pipeline.py rename to examples/pipelines/with_persistence/sklearn_pipeline.py index 2c91bad..bdefffe 100644 --- a/examples/pipeline/sklearn_pipeline.py +++ b/examples/pipelines/with_persistence/sklearn_pipeline.py @@ -19,64 +19,56 @@ with open("config_2.yml") as f: - params = yaml.safe_load(f) + config = yaml.safe_load(f) + +global_params = config["global"] start = time.time() -hf_dataset = load_dataset(params['dataset_path'], split="all_samples") +hf_dataset = load_dataset(global_params['dataset_path'], split="all_samples") print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds") prob_def = huggingface_description_to_problem_definition(hf_dataset.description) -train_split = prob_def.get_split(params['train_split_name'])[:20] -test_split = prob_def.get_split(params['train_split_name'])[:20] +ref_split = prob_def.get_split(global_params['train_split_name'])[:20] +train_split = prob_def.get_split(global_params['train_split_name'])[10:20] +test_split = prob_def.get_split(global_params['train_split_name'])[:20] +dataset_ref, _ = huggingface_dataset_to_plaid(hf_dataset, ids = ref_split) dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split) dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split) + pipeline = Pipeline([ - ('input_scalar_scaler', ScalarScalerNode(name = 'input_scalar_scaler', params = params)), - ('output_scalar_scaler', ScalarScalerNode(name = 'output_scalar_scaler', params = params)), - ('pca_shape_embedding', PCAEmbeddingNode(name = 'pca_shape_embedding', params = params)), - ('pca_field_embedding', PCAEmbeddingNode(name = 'pca_field_embedding', params = params)), - ('tabular_regressor', GPRegressorNode(name = 'tabular_regressor', params = params)) + ('input_scalar_scaler', ScalarScalerNode(name = 'input_scalar_scaler', global_params = global_params, params = config['input_scalar_scaler'])), + ('output_scalar_scaler', ScalarScalerNode(name = 'output_scalar_scaler', global_params = global_params, params = config['output_scalar_scaler'])), + ('pca_shape_embedding', PCAEmbeddingNode(name = 'pca_shape_embedding', global_params = global_params, params = config['pca_shape_embedding'])), + ('pca_field_embedding', PCAEmbeddingNode(name = 'pca_field_embedding', global_params = global_params, params = config['pca_field_embedding'])), + ('tabular_regressor', GPRegressorNode(name = 
'tabular_regressor', global_params = global_params, params = config['tabular_regressor'])) ]) +print("pipeline parameters=", pipeline.get_params(deep=True)) + +1./0. + ind = train_split[0] pipeline.fit(dataset_train) print("pipeline fitted") -dataset_test_2 = pipeline.predict(dataset_test) +dataset_ref = pipeline.transform(dataset_ref) -print("score =", pipeline.score(dataset_test, dataset_test_2)) +dataset_test_2 = pipeline.inverse_transform(pipeline.predict(dataset_ref)) -print(dataset_test_2) - -# dataset_test._save_to_dir_(os.path.join(params['save_path'], "dataset_test"), verbose = True) -# dataset_test_2._save_to_dir_(os.path.join(params['save_path'], "dataset_test_2"), verbose = True) +print("score =", pipeline.score(dataset_test_2, dataset_ref)) -from Muscat.Bridges.CGNSBridge import CGNSToMesh, MeshToCGNS -from Muscat.IO.XdmfWriter import WriteMeshToXdmf - -mesh1 = CGNSToMesh(dataset_test[0].get_mesh(), baseNames = ["Base_2_2"]) -mesh2 = CGNSToMesh(dataset_test_2[0].get_mesh(), baseNames = ["Base_2_2"]) +print(dataset_test_2) -print(mesh1) -print(mesh2) +dataset_ref._save_to_dir_(os.path.join(params['save_path'], "dataset_ref"), verbose = True) +dataset_test_2._save_to_dir_(os.path.join(params['save_path'], "dataset_test_2"), verbose = True) -WriteMeshToXdmf(os.path.join(params['save_path'], "mesh1.xdmf"), - mesh1, - PointFields = [dataset_test[0].get_field(fn, base_name = "Base_2_2") for fn in ['mach', 'nut']], - PointFieldsNames = ['mach', 'nut'] -) -WriteMeshToXdmf(os.path.join(params['save_path'], "mesh2.xdmf"), - mesh2, - PointFields = [dataset_test_2[0].get_field(fn, base_name = "Base_2_2") for fn in ['mach', 'nut']], - PointFieldsNames = ['mach', 'nut'] -) # print(dataset_train[ind]) # print(dataset_train[ind].get_scalar_names()) diff --git a/src/plaid/containers/dataset.py b/src/plaid/containers/dataset.py index c075146..972127d 100644 --- a/src/plaid/containers/dataset.py +++ b/src/plaid/containers/dataset.py @@ -697,6 +697,20 @@ def save(self, fname: Union[str, Path]) -> None: # Finally : removes directory shutil.rmtree(savedir) + @classmethod + def from_list_of_samples(cls, list_of_samples: list[Sample]) -> Self: + """Initialise a dataset from a list of samples. + + Args: + list_of_samples (list[Sample]): The list of samples. + + Returns: + Self: The intialized dataset (Dataset). + """ + instance = cls() + instance.add_samples(list_of_samples) + return instance + @classmethod def load_from_file( cls, fname: Union[str, Path], verbose: bool = False, processes_number: int = 0 @@ -1025,17 +1039,19 @@ def __len__(self) -> int: """ return len(self._samples) - def __getitem__(self, id: int) -> Sample: + def __getitem__( + self, id: Union[int, slice, list[int], np.ndarray] + ) -> Union[Sample, Self]: """Retrieve a specific sample by its ID int this dataset. Args: - id (int): The ID of the sample to retrieve. + id (Union[int, slice, list[int], np.ndarray]): The ID(s) of the sample to retrieve. Raises: IndexError: If the provided ID is out of bounds or does not exist in the dataset. Returns: - Sample: The sample with the specified ID. + Union[Sample, Dataset]: The sample with the specified ID or a dataset in the specified IDs. Example: .. code-block:: python @@ -1047,12 +1063,21 @@ def __getitem__(self, id: int) -> Sample: Seealso: This function can also be called using `__call__()`. 
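+
+        Example:
+            .. code-block:: python
+
+                # assuming `dataset` is a populated Dataset instance
+                sample = dataset[0]       # a single id returns a Sample
+                subset = dataset[0:2]     # a slice returns a Dataset
+                subset = dataset[[0, 1]]  # a list of ids also returns a Dataset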
""" - if id in self._samples: - return self._samples[id] + if isinstance(id, slice) or isinstance(id, list) or isinstance(id, np.ndarray): + if isinstance(id, slice): + id = list(range(*id.indices(len(self)))) + dataset = Dataset() + for i in id: + dataset.add_sample(self[i], i) + dataset.set_infos(self.get_infos()) + return dataset else: - raise IndexError( - f"sample with {id=} not set -> use 'Dataset.add_sample' or 'Dataset.add_samples'" - ) + if id in self._samples: + return self._samples[id] + else: + raise IndexError( + f"sample with {id=} not set -> use 'Dataset.add_sample' or 'Dataset.add_samples'" + ) __call__ = __getitem__ From 11b9c2e97c889c225fc03892b9f8158bc9e33a22 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Mon, 30 Jun 2025 10:36:42 +0200 Subject: [PATCH 07/19] fix(ruff formatting) --- src/plaid/containers/dataset.py | 3 ++- tests/containers/test_dataset.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/plaid/containers/dataset.py b/src/plaid/containers/dataset.py index 972127d..d1abc1f 100644 --- a/src/plaid/containers/dataset.py +++ b/src/plaid/containers/dataset.py @@ -1064,11 +1064,12 @@ def __getitem__( This function can also be called using `__call__()`. """ if isinstance(id, slice) or isinstance(id, list) or isinstance(id, np.ndarray): + print(">>>", type(id)) if isinstance(id, slice): id = list(range(*id.indices(len(self)))) dataset = Dataset() for i in id: - dataset.add_sample(self[i], i) + dataset.add_sample(self[int(i)], int(i)) dataset.set_infos(self.get_infos()) return dataset else: diff --git a/tests/containers/test_dataset.py b/tests/containers/test_dataset.py index d13c4fb..b01a39d 100644 --- a/tests/containers/test_dataset.py +++ b/tests/containers/test_dataset.py @@ -417,6 +417,12 @@ def test_merge_dataset_with_bad_type(self, dataset_with_samples): # -------------------------------------------------------------------------# + def test_from_list_of_samples(self, samples): + loaded_dataset = Dataset.from_list_of_samples(samples) + assert len(loaded_dataset) == len(samples) + + # -------------------------------------------------------------------------# + def test_save(self, dataset_with_samples, tmp_path): fname = tmp_path / "test.plaid" dataset_with_samples.save(fname) @@ -551,7 +557,10 @@ def test___getitem__empty(self, dataset): dataset[0] def test___getitem__(self, dataset_with_samples, nb_samples): - dataset_with_samples[np.random.randint(nb_samples)] + assert isinstance(dataset_with_samples[np.random.randint(nb_samples)], Sample) + assert isinstance(dataset_with_samples[1 : nb_samples - 1], Dataset) + assert isinstance(dataset_with_samples[np.arange(1, nb_samples - 1)], Dataset) + assert isinstance(dataset_with_samples[list(range(1, nb_samples - 1))], Dataset) def test___call__empty(self, dataset): with pytest.raises(IndexError): From 5ec1d0190882081fdb7ea66abbd92f9289811044 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Mon, 30 Jun 2025 13:48:38 +0200 Subject: [PATCH 08/19] feat(tests) reduce samples list in autotests to speedup pytest runs --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 12254f9..8806eae 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,7 +38,7 @@ def generate_samples(nb: int, zone_name: str, base_name: str) -> list[Sample]: @pytest.fixture() def nb_samples() -> int: """Number of samples to generate for tests.""" - return 11 + return 4 @pytest.fixture() From 
02cf15df8c25fb8a3bd11f12899ac4c31ce436f6 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Mon, 30 Jun 2025 13:50:13 +0200 Subject: [PATCH 09/19] feat(tests) improve coverage of huggingface_bridge --- tests/bridges/test_huggingface_bridge.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/bridges/test_huggingface_bridge.py b/tests/bridges/test_huggingface_bridge.py index 3e6fa55..472425d 100644 --- a/tests/bridges/test_huggingface_bridge.py +++ b/tests/bridges/test_huggingface_bridge.py @@ -22,7 +22,7 @@ @pytest.fixture() def dataset(samples, infos) -> Dataset: dataset = Dataset() - dataset.add_samples(samples) + dataset.add_samples(samples[:2]) dataset.set_infos(infos) return dataset @@ -90,6 +90,14 @@ def test_huggingface_dataset_to_plaid(self, hf_dataset): ds, pbdef = huggingface_bridge.huggingface_dataset_to_plaid(hf_dataset) self.assert_plaid_dataset(ds, pbdef) + def test_huggingface_dataset_to_plaid_with_ids(self, hf_dataset): + huggingface_bridge.huggingface_dataset_to_plaid(hf_dataset, ids=[0, 1]) + + def test_huggingface_dataset_to_plaid_large(self, hf_dataset): + huggingface_bridge.huggingface_dataset_to_plaid( + hf_dataset, processes_number=2, large_dataset=True + ) + def test_create_string_for_huggingface_dataset_card(self, hf_dataset): huggingface_bridge.create_string_for_huggingface_dataset_card( description=hf_dataset.description, From c449a924f662eaffb58b286c0f28de2304bbacf0 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Mon, 30 Jun 2025 14:21:34 +0200 Subject: [PATCH 10/19] feat(huggingface_bridge) improve coverage --- src/plaid/bridges/huggingface_bridge.py | 14 +++++++++++--- tests/bridges/test_huggingface_bridge.py | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/plaid/bridges/huggingface_bridge.py b/src/plaid/bridges/huggingface_bridge.py index 26e1ff0..4a46ae2 100644 --- a/src/plaid/bridges/huggingface_bridge.py +++ b/src/plaid/bridges/huggingface_bridge.py @@ -103,7 +103,7 @@ def plaid_generator_to_huggingface( generator: Callable, infos: dict, problem_definition: ProblemDefinition, - processes_number: int = os.cpu_count(), + processes_number: int = 1, ) -> datasets.Dataset: """Use this function for creating a huggingface dataset from a sample generator function. @@ -193,7 +193,7 @@ def __call__(self, idx): def huggingface_dataset_to_plaid( ds: datasets.Dataset, ids: list[int] = None, - processes_number: int = os.cpu_count(), + processes_number: int = 1, large_dataset=False, ) -> tuple[Self, ProblemDefinition]: """Use this function for converting a plaid dataset from a huggingface dataset. 
@@ -221,13 +221,21 @@ def huggingface_dataset_to_plaid( dataset = load_from_disk("chanel/dataset") plaid_dataset, plaid_problem = huggingface_dataset_to_plaid(dataset) """ + assert processes_number <= len(ds), ( + "Trying to parallelize with more processes than samples in dataset" + ) + if ids: + assert processes_number <= len(ids), ( + "Trying to parallelize with more processes than selected samples in dataset" + ) + dataset = Dataset() print("Converting huggingface dataset to plaid dataset...") if large_dataset: if ids: - NotImplementedError( + raise NotImplementedError( "ids selection not implemented with large_dataset option" ) for i in range(processes_number): diff --git a/tests/bridges/test_huggingface_bridge.py b/tests/bridges/test_huggingface_bridge.py index 472425d..d54161c 100644 --- a/tests/bridges/test_huggingface_bridge.py +++ b/tests/bridges/test_huggingface_bridge.py @@ -98,6 +98,24 @@ def test_huggingface_dataset_to_plaid_large(self, hf_dataset): hf_dataset, processes_number=2, large_dataset=True ) + def test_huggingface_dataset_to_plaid_with_ids_large(self, hf_dataset): + with pytest.raises(NotImplementedError): + huggingface_bridge.huggingface_dataset_to_plaid( + hf_dataset, ids=[0, 1], processes_number=2, large_dataset=True + ) + + def test_huggingface_dataset_to_plaid_error_processes_number(self, hf_dataset): + with pytest.raises(AssertionError): + huggingface_bridge.huggingface_dataset_to_plaid( + hf_dataset, processes_number=128 + ) + + def test_huggingface_dataset_to_plaid_error_processes_number_2(self, hf_dataset): + with pytest.raises(AssertionError): + huggingface_bridge.huggingface_dataset_to_plaid( + hf_dataset, ids=[0], processes_number=2 + ) + def test_create_string_for_huggingface_dataset_card(self, hf_dataset): huggingface_bridge.create_string_for_huggingface_dataset_card( description=hf_dataset.description, From aadbc61fdca2736e94f965e2ba91c854ef3b735a Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Mon, 30 Jun 2025 14:24:49 +0200 Subject: [PATCH 11/19] feat(examples/pipelines) remove wip files --- examples/pipelines/experiments/config_0.yml | 9 - .../pipelines/experiments/persistent_node.py | 448 ------------------ examples/pipelines/experiments/pipefunc_ex.py | 67 --- .../experiments/pipefunc_tensile2d.py | 132 ------ .../experiments/pipefunc_tensile2d_2.py | 104 ---- .../pipelines/experiments/regressor_node.py | 28 -- examples/pipelines/experiments/scaler_node.py | 33 -- .../experiments/sklearn_hf_pipeline.py | 90 ---- .../pipelines/with_persistence/config_2.yml | 62 --- .../with_persistence/ml_pipeline_nodes.py | 424 ----------------- .../with_persistence/sklearn_pipeline.py | 92 ---- 11 files changed, 1489 deletions(-) delete mode 100644 examples/pipelines/experiments/config_0.yml delete mode 100644 examples/pipelines/experiments/persistent_node.py delete mode 100644 examples/pipelines/experiments/pipefunc_ex.py delete mode 100644 examples/pipelines/experiments/pipefunc_tensile2d.py delete mode 100644 examples/pipelines/experiments/pipefunc_tensile2d_2.py delete mode 100644 examples/pipelines/experiments/regressor_node.py delete mode 100644 examples/pipelines/experiments/scaler_node.py delete mode 100644 examples/pipelines/experiments/sklearn_hf_pipeline.py delete mode 100644 examples/pipelines/with_persistence/config_2.yml delete mode 100644 examples/pipelines/with_persistence/ml_pipeline_nodes.py delete mode 100644 examples/pipelines/with_persistence/sklearn_pipeline.py diff --git a/examples/pipelines/experiments/config_0.yml 
b/examples/pipelines/experiments/config_0.yml deleted file mode 100644 index 7348e7a..0000000 --- a/examples/pipelines/experiments/config_0.yml +++ /dev/null @@ -1,9 +0,0 @@ -load_hf_from_hub: - - path: "PLAID-datasets/Tensile2d" - -scale_scalars: - - train_split_name: "train_500" - - test_split_name: "test" - -save: - - out_path: "./artifacts" \ No newline at end of file diff --git a/examples/pipelines/experiments/persistent_node.py b/examples/pipelines/experiments/persistent_node.py deleted file mode 100644 index a4e9651..0000000 --- a/examples/pipelines/experiments/persistent_node.py +++ /dev/null @@ -1,448 +0,0 @@ -from sklearn.base import BaseEstimator, RegressorMixin -import joblib -import json -from pathlib import Path -import hashlib -import numpy as np - - -class PersistentNode(BaseEstimator): - def __init__(self, save_path="models/unnamed.joblib"): - self.save_path = save_path - self._path = Path(save_path) - self._meta_path = self._path.with_suffix('.meta.json') - self.fitted_ = False - self.model = None - - def _save_file(self, obj, input_hash=None): - self._path.parent.mkdir(parents=True, exist_ok=True) - joblib.dump(obj, self._path) - if input_hash is not None: - self._meta_path.write_text(json.dumps({'hash': input_hash})) - self.fitted_ = True - - def save_model_with_hash(self, obj, data): - input_hash = self.hash_input(data) - self._save_file(obj, input_hash=input_hash) - - def load(self): - obj = joblib.load(self._path) - self.fitted_ = True - return obj - - def exists(self): - return self._path.exists() and self._meta_path.exists() - - def get_stored_hash(self): - if self._meta_path.exists(): - return json.loads(self._meta_path.read_text()).get('hash') - return None - - def hash_input(self, data): - m = hashlib.sha256() - if isinstance(data, tuple): - for arr in data: - m.update(np.ascontiguousarray(arr).data) - else: - m.update(np.ascontiguousarray(data).data) - return m.hexdigest() - - def check_fitted_or_load(self): - if self.fitted_: - return - if self.exists(): - loaded = self.load() - self.set_model(loaded) - else: - raise ValueError("Model not fitted or not saved.") - - def set_model(self, obj): - self.model = obj - - def get_model(self): - return self.model - - def load_if_cached(self, data): - input_hash = self.hash_input(data) - if self.exists() and self.get_stored_hash() == input_hash: - self.set_model(self.load()) - return True - return False - - def get_params(self, deep=True): - params = super().get_params(deep=deep) - if deep and hasattr(self.model, "get_params"): - for k, v in self.model.get_params(deep=True).items(): - params[f"model__{k}"] = v - return params - - def set_params(self, **params): - model_params = {k[7:]: v for k, v in params.items() if k.startswith("model__")} - node_params = {k: v for k, v in params.items() if not k.startswith("model__")} - if node_params: - super().set_params(**node_params) - if model_params and self.model is not None: - self.model.set_params(**model_params) - return self - - - -from sklearn.base import BaseEstimator, RegressorMixin -from sklearn.model_selection import GridSearchCV -from sklearn.datasets import make_regression -from sklearn.metrics import mean_squared_error -from pipefunc import pipefunc, Pipeline -import numpy as np -import inspect -from typing import Dict, Any, Optional - -class PipefuncSklearnWrapper(BaseEstimator, RegressorMixin): - """ - Scikit-learn compatible wrapper for pipefunc Pipeline objects. 
- - This wrapper allows pipefunc pipelines to be used with scikit-learn tools - like GridSearchCV, cross_val_score, etc. - """ - - def __init__(self, pipeline: Pipeline, prediction_output: str = "predictions", **kwargs): - """ - Initialize the wrapper. - - Parameters: - ----------- - pipeline : pipefunc.Pipeline - The pipefunc pipeline to wrap - prediction_output : str - Name of the pipeline output that contains predictions - **kwargs : dict - Additional parameters that can be tuned via GridSearchCV - """ - self.pipeline = pipeline - self.prediction_output = prediction_output - - # Store additional parameters - for key, value in kwargs.items(): - setattr(self, key, value) - - # Keep track of parameter names for get_params/set_params - self._param_names = set(kwargs.keys()) - self._param_names.add('pipeline') - self._param_names.add('prediction_output') - - # Training data storage - self.X_train_ = None - self.y_train_ = None - self.is_fitted_ = False - - def _rebuild_pipeline_with_params(self) -> Pipeline: - """ - Rebuild the pipeline with current parameters. - - This method attempts to inject current parameters into pipeline functions - that accept them as arguments. - """ - # For now, return the original pipeline - # In a more sophisticated implementation, you would: - # 1. Extract functions from the original pipeline - # 2. Create new functions with updated parameters - # 3. Rebuild the pipeline - return self.pipeline - - def _validate_pipeline_output(self, result): - """Validate that the pipeline returns the expected output.""" - if isinstance(result, dict): - if self.prediction_output not in result: - available_outputs = list(result.keys()) - raise ValueError( - f"Pipeline output '{self.prediction_output}' not found. " - f"Available outputs: {available_outputs}" - ) - return result[self.prediction_output] - else: - # Assume the result is the direct prediction - return result - - def fit(self, X, y): - """ - Fit the pipeline. - - Parameters: - ----------- - X : array-like of shape (n_samples, n_features) - Training data - y : array-like of shape (n_samples,) - Target values - - Returns: - -------- - self : object - Returns self for method chaining - """ - # Store training data - self.X_train_ = X.copy() if hasattr(X, 'copy') else np.array(X) - self.y_train_ = y.copy() if hasattr(y, 'copy') else np.array(y) - - # Rebuild pipeline with current parameters - self.pipeline_ = self._rebuild_pipeline_with_params() - - # Mark as fitted - self.is_fitted_ = True - - return self - - def predict(self, X): - """ - Make predictions using the fitted pipeline. 
- - Parameters: - ----------- - X : array-like of shape (n_samples, n_features) - Input data to predict on - - Returns: - -------- - predictions : array-like of shape (n_samples,) - Predicted values - """ - if not self.is_fitted_: - raise ValueError("This PipefuncSklearnWrapper instance is not fitted yet.") - - try: - # Execute the pipeline with training data and test data - # This assumes the pipeline expects both training and test data - result = self.pipeline_( - X=self.X_train_, - y=self.y_train_, - X_test=X - ) - - # Extract predictions from result - predictions = self._validate_pipeline_output(result) - - return predictions - - except Exception as e: - # Try alternative calling patterns - try: - # Maybe the pipeline expects different argument names - result = self.pipeline_( - X_train=self.X_train_, - y_train=self.y_train_, - X_test=X - ) - return self._validate_pipeline_output(result) - except: - # Try calling with specific output name - try: - result = self.pipeline_( - self.prediction_output, - X=self.X_train_, - y=self.y_train_, - X_test=X - ) - return result - except: - raise RuntimeError( - f"Failed to execute pipeline for prediction. " - f"Original error: {str(e)}" - ) - - def score(self, X, y): - """ - Return the coefficient of determination R^2 of the prediction. - - Parameters: - ----------- - X : array-like of shape (n_samples, n_features) - Test samples - y : array-like of shape (n_samples,) - True values for X - - Returns: - -------- - score : float - R^2 score - """ - from sklearn.metrics import r2_score - - predictions = self.predict(X) - return r2_score(y, predictions) - - def get_params(self, deep=True): - """ - Get parameters for this estimator. - - Parameters: - ----------- - deep : bool, default=True - If True, will return the parameters for this estimator and - contained subobjects that are estimators. - - Returns: - -------- - params : dict - Parameter names mapped to their values - """ - params = {} - - # Get all stored parameters - for param_name in self._param_names: - if hasattr(self, param_name): - params[param_name] = getattr(self, param_name) - - # If deep=True, try to extract parameters from pipeline functions - if deep and hasattr(self, 'pipeline') and self.pipeline is not None: - try: - # Try to access pipefunc internal structure - if hasattr(self.pipeline, 'graph'): - # NetworkX graph-based approach - for node_name in self.pipeline.graph.nodes(): - node_data = self.pipeline.graph.nodes[node_name] - if 'func' in node_data: - func = node_data['func'] - # Extract function parameters - self._extract_function_params(func, node_name, params) - except Exception: - # If we can't extract deep parameters, continue silently - pass - - return params - - def _extract_function_params(self, func, node_name, params): - """Extract parameters from a pipeline function.""" - try: - # Get the original function if it's wrapped - original_func = func - if hasattr(func, '__wrapped__'): - original_func = func.__wrapped__ - - # Get function signature - sig = inspect.signature(original_func) - - # Extract parameters with defaults - for param_name, param in sig.parameters.items(): - if param.default is not inspect.Parameter.empty: - prefixed_name = f"{node_name}__{param_name}" - params[prefixed_name] = param.default - except Exception: - # If parameter extraction fails, continue silently - pass - - def set_params(self, **params): - """ - Set the parameters of this estimator. 
- - Parameters: - ----------- - **params : dict - Estimator parameters - - Returns: - -------- - self : object - Estimator instance - """ - valid_params = self.get_params(deep=True) - - for key, value in params.items(): - if key in valid_params: - setattr(self, key, value) - # Add to tracked parameters if not already there - if key not in self._param_names: - self._param_names.add(key) - else: - raise ValueError( - f"Invalid parameter {key} for estimator {self.__class__.__name__}. " - f"Valid parameters are: {sorted(valid_params.keys())}" - ) - - # Reset fitted state if parameters changed - self.is_fitted_ = False - - return self - - - - -# class PipefuncWrapper(BaseEstimator, RegressorMixin): -# def __init__(self, pipe=None): -# self.pipe = pipe - -# def fit(self, X, y): -# self.pipe.clear() # make sure it's fresh -# self.pipe.map({ -# "X": X, -# "y": y -# }) -# return self - -# def predict(self, X): -# # Replace inputs, remove cached results downstream -# self.pipe.clear() -# self.pipe.map({ -# "X": X -# }) -# return self.pipe["y_pred"] - -# def get_params(self, deep=True): -# """Get parameters for this estimator""" -# params = {} - -# # Get all parameters stored as instance attributes -# for param_name in self._param_names: -# if hasattr(self, param_name): -# params[param_name] = getattr(self, param_name) - -# # If deep=True and we have a pipeline, try to get function parameters -# if deep and self.pipe is not None: -# try: -# # pipefunc stores functions in a different way -# # Access the internal graph structure -# if hasattr(self.pipe, 'graph'): -# for node_name in self.pipe.graph.nodes(): -# node_data = self.pipe.graph.nodes[node_name] -# if 'func' in node_data: -# func = node_data['func'] -# # Get function signature parameters -# if hasattr(func, '__wrapped__'): # For decorated functions -# func = func.__wrapped__ -# sig = inspect.signature(func) -# for param_name, param in sig.parameters.items(): -# if param.default is not inspect.Parameter.empty: -# params[f"{node_name}__{param_name}"] = param.default -# elif hasattr(self.pipe, '_functions'): -# # Alternative access pattern -# for func in self.pipe._functions: -# func_name = getattr(func, 'output_name', func.__name__) -# if hasattr(func, '__wrapped__'): -# original_func = func.__wrapped__ -# else: -# original_func = func -# sig = inspect.signature(original_func) -# for param_name, param in sig.parameters.items(): -# if param.default is not inspect.Parameter.empty: -# params[f"{func_name}__{param_name}"] = param.default -# except Exception as e: -# # If we can't extract deep parameters, just continue -# # This ensures compatibility even if pipefunc internals change -# pass - -# return params - -# def set_params(self, **params): -# """Set parameters for this estimator""" -# valid_params = self.get_params(deep=True) - -# for key, value in params.items(): -# if key in valid_params: -# if "__" in key: -# # This is a nested parameter (function parameter) -# # Store it for use when rebuilding the pipeline -# setattr(self, key, value) -# else: -# # This is a top-level parameter -# setattr(self, key, value) -# if key not in self._param_names: -# self._param_names.add(key) -# else: -# raise ValueError(f"Invalid parameter {key}") \ No newline at end of file diff --git a/examples/pipelines/experiments/pipefunc_ex.py b/examples/pipelines/experiments/pipefunc_ex.py deleted file mode 100644 index 05e2617..0000000 --- a/examples/pipelines/experiments/pipefunc_ex.py +++ /dev/null @@ -1,67 +0,0 @@ -import numpy as np -from skimage import data, 
filters, measure -from skimage.color import rgb2gray -from skimage.segmentation import find_boundaries - -from pipefunc import Pipeline, pipefunc - - -# Step 1: Image Loading and Preprocessing -@pipefunc(output_name="gray_image", mapspec="image[n] -> gray_image[n]") -def load_and_preprocess_image(image): - return rgb2gray(image) - - -# Step 2: Image Segmentation -@pipefunc(output_name="segmented_image", mapspec="gray_image[n] -> segmented_image[n]") -def segment_image(gray_image): - return filters.sobel(gray_image) - - -# Step 3: Feature Extraction -@pipefunc(output_name="feature", mapspec="segmented_image[n] -> feature[n]") -def extract_feature(segmented_image): - boundaries = find_boundaries(segmented_image > 0.1) - labeled_image = measure.label(boundaries) - num_regions = np.max(labeled_image) - return {"num_regions": num_regions} - - -# Step 4: Object Classification -@pipefunc(output_name="classification", mapspec="feature[n] -> classification[n]") -def classify_object(feature): - # Classify image as 'Complex' if the number of regions is above a threshold. - classification = "Complex" if feature["num_regions"] > 5 else "Simple" - return classification - - -# Step 5: Result Aggregation -@pipefunc(output_name="summary") -def aggregate_results(classification): - simple_count = sum(1 for c in classification if c == "Simple") - complex_count = len(classification) - simple_count - return {"Simple": simple_count, "Complex": complex_count} - - -if __name__ == "__main__": - # Create the pipeline - pipeline_img = Pipeline( - [ - load_and_preprocess_image, - segment_image, - extract_feature, - classify_object, - aggregate_results, - ], - ) - - # Simulate a batch of images (using built-in scikit-image sample images) - images = [ - data.astronaut(), - data.coffee(), - data.coffee(), - ] # Repeat the coffee image to simulate multiple images - - # Run the pipeline on the images - results_summary = pipeline_img.map({"image": images}) - print("Classification Summary:", results_summary["summary"].output) \ No newline at end of file diff --git a/examples/pipelines/experiments/pipefunc_tensile2d.py b/examples/pipelines/experiments/pipefunc_tensile2d.py deleted file mode 100644 index 54f3966..0000000 --- a/examples/pipelines/experiments/pipefunc_tensile2d.py +++ /dev/null @@ -1,132 +0,0 @@ -from datasets import load_from_disk, load_dataset -from plaid.containers.sample import Sample -from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition -import os, pickle -import numpy as np -from safetensors.numpy import save_file -from sklearn.base import BaseEstimator, RegressorMixin - -from pipefunc import Pipeline, pipefunc -from sklearn.preprocessing import StandardScaler - -from pathlib import Path -import joblib - -import yaml -import time - - - -@pipefunc(output_name=("dataset", "prob_def")) -def load_hf_from_disk(path): - return huggingface_dataset_to_plaid(load_from_disk(path)) - - -@pipefunc(output_name=("dataset", "prob_def")) -def load_hf_from_hub(path): - start = time.time() - hf_dataset = load_dataset(path, split="all_samples") - print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds") - dataset = huggingface_dataset_to_plaid(hf_dataset) - return dataset - - -@pipefunc(output_name=("scalar_data")) -def scale_scalars(dataset, prob_def, train_split_name, test_split_name, out_path): - - ids_train = prob_def.get_split(train_split_name) - input_scalars_train = dataset.get_scalars_to_tabular( - scalar_names = 
prob_def.get_input_scalars_names(), - sample_ids = ids_train, - as_nparray = True - ) - output_scalars_train = dataset.get_scalars_to_tabular( - scalar_names = prob_def.get_output_scalars_names(), - sample_ids = ids_train, - as_nparray = True - ) - - ids_test = prob_def.get_split(test_split_name) - input_scalars_test = dataset.get_scalars_to_tabular( - scalar_names = prob_def.get_input_scalars_names(), - sample_ids = ids_test, - as_nparray = True - ) - - input_scalar_scaler = StandardScaler() - input_scalars_train = input_scalar_scaler.fit_transform(input_scalars_train) - input_scalars_test = input_scalar_scaler.transform(input_scalars_test) - - output_scalar_scaler = StandardScaler() - output_scalars_train = output_scalar_scaler.fit_transform(output_scalars_train) - - scalar_data = [ - input_scalar_scaler, - output_scalar_scaler, - input_scalars_train, - input_scalars_test, - output_scalars_train - ] - - os.makedirs(out_path, exist_ok=True) - - tensors = { - "scaler_mean": input_scalar_scaler.mean_, - "scaler_scale": input_scalar_scaler.scale_, - } - saved_path = os.path.join(out_path, f"input_scalar_scaler.safetensors") - save_file(tensors, saved_path) - - tensors = { - "scaler_mean": output_scalar_scaler.mean_, - "scaler_scale": output_scalar_scaler.scale_, - } - saved_path = os.path.join(out_path, f"output_scalar_scaler.safetensors") - save_file(tensors, saved_path) - - return scalar_data - - -def extract_leaf_keys(d): - leaves = {} - if isinstance(d, dict): - for k, v in d.items(): - if isinstance(v, dict) or isinstance(v, list): - leaves.update(extract_leaf_keys(v)) - else: - leaves[k] = v - elif isinstance(d, list): - for item in d: - leaves.update(extract_leaf_keys(item)) - return leaves - - -if __name__ == "__main__": - - start = time.time() - - pipeline = Pipeline( - [ - load_hf_from_hub, - scale_scalars, - ], - name="ML_Workflow", - profile=True - ) - - # pipeline.visualize() - - with open("config.yml") as f: - config = yaml.safe_load(f) - - parameters = extract_leaf_keys(config) - - scalar_data = pipeline(**parameters) - - print(f"Pipeline execution time {time.time() - start:.2g} seconds") - - - pipeline.print_profiling_stats() - # print("Dataset:", type(dataset[0:10]), type(dataset)) - - # print(scalar_data) diff --git a/examples/pipelines/experiments/pipefunc_tensile2d_2.py b/examples/pipelines/experiments/pipefunc_tensile2d_2.py deleted file mode 100644 index e143006..0000000 --- a/examples/pipelines/experiments/pipefunc_tensile2d_2.py +++ /dev/null @@ -1,104 +0,0 @@ -from datasets import load_from_disk, load_dataset -from plaid.containers.sample import Sample -from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition -import os, pickle -import numpy as np -from safetensors.numpy import save_file -from sklearn.base import BaseEstimator, RegressorMixin - -from pipefunc import Pipeline, pipefunc -from sklearn.preprocessing import StandardScaler - -from pathlib import Path -import joblib - -import yaml -import time - -from ml_pipeline_nodes import ScalarScalerNode - - -@pipefunc(output_name=("dataset", "prob_def")) -def load_hf_from_hub(path): - start = time.time() - hf_dataset = load_dataset(path, split="all_samples") - print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds") - dataset = huggingface_dataset_to_plaid(hf_dataset) - return dataset - - -@pipefunc(output_name=("scaled_dataset")) -def scale_scalars(dataset, prob_def, train_split_name, test_split_name): - - scalar_scaler = 
ScalarScalerNode() - - ids_train = prob_def.get_split(train_split_name) - input_scalars_train = dataset.get_scalars_to_tabular( - scalar_names = prob_def.get_input_scalars_names(), - sample_ids = ids_train, - as_nparray = True - ) - output_scalars_train = dataset.get_scalars_to_tabular( - scalar_names = prob_def.get_output_scalars_names(), - sample_ids = ids_train, - as_nparray = True - ) - - ids_test = prob_def.get_split(test_split_name) - input_scalars_test = dataset.get_scalars_to_tabular( - scalar_names = prob_def.get_input_scalars_names(), - sample_ids = ids_test, - as_nparray = True - ) - - input_scalar_scaler = StandardScaler() - input_scalars_train = input_scalar_scaler.fit_transform(input_scalars_train) - input_scalars_test = input_scalar_scaler.transform(input_scalars_test) - - output_scalar_scaler = StandardScaler() - output_scalars_train = output_scalar_scaler.fit_transform(output_scalars_train) - - return dataset - - -def extract_leaf_keys(d): - leaves = {} - if isinstance(d, dict): - for k, v in d.items(): - if isinstance(v, dict) or isinstance(v, list): - leaves.update(extract_leaf_keys(v)) - else: - leaves[k] = v - elif isinstance(d, list): - for item in d: - leaves.update(extract_leaf_keys(item)) - return leaves - - -if __name__ == "__main__": - - start = time.time() - - pipeline = Pipeline( - [ - load_hf_from_hub, - scale_scalars, - ], - name="ML_Workflow", - profile=True - ) - - # pipeline.visualize() - - with open("config_2.yml") as f: - parameters = yaml.safe_load(f) - - scalar_data = pipeline(**parameters) - - print(f"Pipeline execution time {time.time() - start:.2g} seconds") - - - pipeline.print_profiling_stats() - # print("Dataset:", type(dataset[0:10]), type(dataset)) - - # print(scalar_data) diff --git a/examples/pipelines/experiments/regressor_node.py b/examples/pipelines/experiments/regressor_node.py deleted file mode 100644 index 7afcbc7..0000000 --- a/examples/pipelines/experiments/regressor_node.py +++ /dev/null @@ -1,28 +0,0 @@ -from sklearn.linear_model import Ridge -from sklearn.base import RegressorMixin -from persistent_node import PersistentNode - - -class RegressorNode(PersistentNode, RegressorMixin): - def __init__(self, save_path="models/regressor.joblib"): - super().__init__(save_path) - self.model = Ridge() - - def __call__(self, X, y=None): - if y is not None: - self.fit(X, y) - return self.predict(X) - - def fit(self, X, y): - if self.load_if_cached((X, y)): - return self - self.model.fit(X, y) - self.save_model_with_hash(self.model, (X, y)) - return self - - def predict(self, X): - self.check_fitted_or_load() - return self.model.predict(X) - - def inverse_transform(self, y_pred): - return y_pred diff --git a/examples/pipelines/experiments/scaler_node.py b/examples/pipelines/experiments/scaler_node.py deleted file mode 100644 index 42d37ef..0000000 --- a/examples/pipelines/experiments/scaler_node.py +++ /dev/null @@ -1,33 +0,0 @@ -from sklearn.preprocessing import StandardScaler -from persistent_node import PersistentNode - - -class ScalerNode(PersistentNode): - def __init__(self, save_path="models/scaler.joblib"): - super().__init__(save_path) - self.model = StandardScaler() - - def __call__(self, X): - return self.fit_transform(X) - - def fit(self, X, y=None): - if self.load_if_cached(X): - return self - self.model.fit(X) - self.save_model_with_hash(self.model, X) - return self - - def transform(self, X): - self.check_fitted_or_load() - return self.model.transform(X) - - def fit_transform(self, X, y=None): - if self.load_if_cached(X): - return 
self.model.transform(X) - self.model.fit(X) - self.save_model_with_hash(self.model, X) - return self.model.transform(X) - - def inverse_transform(self, X_scaled): - self.check_fitted_or_load() - return self.model.inverse_transform(X_scaled) \ No newline at end of file diff --git a/examples/pipelines/experiments/sklearn_hf_pipeline.py b/examples/pipelines/experiments/sklearn_hf_pipeline.py deleted file mode 100644 index 9270528..0000000 --- a/examples/pipelines/experiments/sklearn_hf_pipeline.py +++ /dev/null @@ -1,90 +0,0 @@ -from datasets import load_dataset -from plaid.containers.sample import Sample -from plaid.containers.dataset import Dataset -from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition -import os, pickle -import numpy as np -from safetensors.numpy import save_file -from sklearn.base import BaseEstimator, RegressorMixin - -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler - -from pathlib import Path -import joblib - -import yaml -import time - -from ml_pipeline_HF_nodes import ScalarScalerNode, GPRegressorNode - - -with open("config_2.yml") as f: - params = yaml.safe_load(f) - - -class HFDataset(Dataset): - def __init__(self, hf_dataset = None, path = None): - assert not (hf_dataset and path), "hf_dataset and path cannot be both initialized" - assert hf_dataset or path, "hf_dataset and path cannot be both not initialized" - if hf_dataset: - self.ds = hf_dataset - elif path: - self.ds = load_dataset(path, split="all_samples") - - def __getitem__(self, idx): - if isinstance(idx, slice): - return HFDataset(self.ds[idx]) - return Sample.model_validate(pickle.loads(self.ds[idx]["sample"])) - - def __len__(self): - return len(self.ds) - - def __iter__(self): - for i in range(len(self)): - yield self[i] - - def __getattr__(self, name): - attr = getattr(self.ds, name) - if callable(attr): - def wrapper(*args, **kwargs): - result = attr(*args, **kwargs) - if isinstance(result, type(self.ds)): - return HFDataset(result) - return result - return wrapper - return attr - - -start = time.time() -dataset = HFDataset(path = params['dataset_path']) -print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds") - -params['prob_def'] = huggingface_description_to_problem_definition(dataset.description) - - -pipeline = Pipeline([ - ('scalar_scaler', ScalarScalerNode(name = 'scalar_scaler', params = params)), - # ('tabular_regressor', GPRegressorNode(name = 'tabular_regressor', params = params)) -]) - -print(dataset[0].get_scalar('max_U2_top')) - - -pipeline.fit_transform(dataset) -print("pipeline fitted") - -print(dataset[0].get_scalar('max_U2_top')) - -# dataset_2 = pipeline.inverse_transform(pipeline.predict(dataset)) - -# print(dataset_2[0].get_scalar('max_U2_top')) - -# dataset_3 = pipeline.inverse_transform(dataset_2) - -# print(dataset_3[0].get_scalar('p1')) - - - -# dataset_4 = pipeline.inverse_transform(pipeline.fit_transform(dataset)) -# print(dataset_4[0].get_scalar('p1')) diff --git a/examples/pipelines/with_persistence/config_2.yml b/examples/pipelines/with_persistence/config_2.yml deleted file mode 100644 index 10d870a..0000000 --- a/examples/pipelines/with_persistence/config_2.yml +++ /dev/null @@ -1,62 +0,0 @@ -global: - dataset_path: "PLAID-datasets/VKI-LS59" - save_path: "./artifacts" - train_split_name: "train" - test_split_name: "test" - -input_scalar_scaler: - type: "MinMaxScaler" - scalar_names: - - angle_in - - mach_out - -output_scalar_scaler: - 
type: "MinMaxScaler" - scalar_names: - - Q - - power - - Pr - - Tr - - eth_is - - angle_out - -pca_shape_embedding: - type: "PCA" - base_name: Base_2_2 - n_components: - nodes: 3 - -pca_field_embedding: - type: "PCA" - base_name: Base_2_2 - n_components: - mach: 5 - nut: 8 - -tabular_regressor: - type: "GaussianProcessRegressor" - options: - kernel: Matern - kernel_options: - nu: 2.5 - optim: fmin_l_bfgs_b - num_restarts: 10 - anisotropic: True - random_state: 42 - show_warnings: False - input: - type: scalar - names: - - angle_in - - mach_out - - reduced_nodes_0 - - reduced_nodes_1 - - reduced_nodes_2 - output: - type: scalar - names: - - reduced_mach_0 - - reduced_mach_1 - - reduced_mach_2 - - reduced_mach_3 - - reduced_mach_4 \ No newline at end of file diff --git a/examples/pipelines/with_persistence/ml_pipeline_nodes.py b/examples/pipelines/with_persistence/ml_pipeline_nodes.py deleted file mode 100644 index c0575b6..0000000 --- a/examples/pipelines/with_persistence/ml_pipeline_nodes.py +++ /dev/null @@ -1,424 +0,0 @@ -import os -import json -import joblib -from joblib import Parallel, delayed -import numpy as np -from pathlib import Path -from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin -from sklearn.linear_model import Ridge -from sklearn.decomposition import PCA - - -# def flatten_dict(d, prefix=''): -# items = {} -# for k, v in d.items(): -# new_key = f'{prefix}__{k}' if prefix else k -# if isinstance(v, dict): -# items.update(flatten_dict(v, new_key)) -# else: -# items[f'param__{new_key}'] = v -# return items - -# def unflatten_dict(flat): -# nested = {} -# for flat_key, value in flat.items(): -# keys = flat_key.split('__') -# d = nested -# for k in keys[:-1]: -# d = d.setdefault(k, {}) -# d[keys[-1]] = value -# return nested - - -class PersistentNode(BaseEstimator, RegressorMixin, TransformerMixin): - def __init__(self, name, global_params, params): - self.name = name - self.global_params = global_params - self.params = params - self.save_path = Path(os.path.join(global_params['save_path'], f"{name}.joblib")) - self.fitted_ = False - self.model = None - - # def get_params(self, deep=True): - # return flatten_dict(self.params[self.name]) - - # def set_params(self, **kwargs): - # if 'name' in kwargs: - # self.name = kwargs.pop('name') - # if 'params' in kwargs: - # self.params.update(kwargs.pop('params')) - - # # Extract param__ keys and merge into self.params - # flat = {} - # for k, v in kwargs.items(): - # if k.startswith('param__'): - # flat_key = k[len('param__'):] - # flat[flat_key] = v - # nested_update = unflatten_dict(flat) - # self.params = self._deep_merge(self.params, nested_update) - # return self - - def save(self, obj): - self.save_path.parent.mkdir(parents=True, exist_ok=True) - joblib.dump(obj, self.save_path) - self.fitted_ = True - - def load(self): - print(f"Loading existing model from {self.save_path}") - obj = joblib.load(self.save_path) - self.fitted_ = True - return obj - - def exists(self): - return self.save_path.exists() - - def check_fitted_or_load(self): - if self.fitted_: - return - if self.exists(): - self.set_model(self.load()) - else: - raise ValueError("Model not fitted and no saved model found.") - - def set_model(self, obj): - self.model = obj - - def fit(self, X, y=None): - if self.exists(): - self.set_model(self.load()) - return self - self._fit(X, y) - self.save(self.model) - return self - - def transform(self, X): - self.check_fitted_or_load() - return self._transform(X) - - def inverse_transform(self, X): - 
self.check_fitted_or_load() - return self._inverse_transform(X) - - def predict(self, X): - self.check_fitted_or_load() - return self._predict(X) - - def score(self, X, y): - self.check_fitted_or_load() - return self._score(X, y) - - # Protected methods to override in subclasses - def _fit(self, X, y=None): - raise NotImplementedError - - def _predict(self, X): - raise NotImplementedError - - def _transform(self, X): - raise NotImplementedError - - def _inverse_transform(self, X): - raise NotImplementedError - - def _score(self, X, y): - raise NotImplementedError - - - -from sklearn.preprocessing import StandardScaler, MinMaxScaler - -available_scalers = { - "StandardScaler": StandardScaler, - "MinMaxScaler": MinMaxScaler, -} - -class ScalarScalerNode(PersistentNode): - - def __init__(self, name, global_params, params): - super().__init__(name, global_params, params) - self.scalar_names = params['scalar_names'] - self.model = available_scalers[params['type']]() - - def get_scalars(self, dataset): - return dataset.get_scalars_to_tabular( - scalar_names = self.scalar_names, - as_nparray = True - ) - - def set_scalars(self, dataset, scalars): - for i in range(len(dataset)): - for j, sn in enumerate(self.scalar_names): - dataset[i].add_scalar(sn, scalars[i, j]) - - def _fit(self, dataset, y=None): - scalars = self.get_scalars(dataset) - self.model.fit(scalars) - - def _transform(self, dataset): - scalars = self.get_scalars(dataset) - scaled_scalars = self.model.transform(scalars) - self.set_scalars(dataset, scaled_scalars) - - return dataset - - def _inverse_transform(self, dataset): - scaled_scalars = self.get_scalars(dataset) - scalars = self.model.inverse_transform(scaled_scalars) - self.set_scalars(dataset, scalars) - - return dataset - - -class PCAEmbeddingNode(PersistentNode): - - def __init__(self, name, global_params, params): - super().__init__(name, global_params, params) - - self.zone_name = params["zone_name"] if "zone_name" in params else None - self.base_name = params["base_name"] if "base_name" in params else None - self.time = params["time"] if "time" in params else None - self.location = params["location"] if "location" in params else "Vertex" - - assert params['type'] == "PCA" - self.n_components = params['n_components'] - self.field_names = list(self.n_components.keys()) - self.model = {name: PCA(n_components = nc) for name, nc in self.n_components.items()} - - def get_params(self, deep=True): - return {f"n_components__{key}":val for key, val in self.n_components.items()} - - def get_all_fields(self, dataset, fn): - all_fields = [] - for sample in dataset: - if fn == "nodes": - field = sample.get_nodes(self.zone_name, self.base_name, self.time).flatten() - else: - field = sample.get_field(fn, self.zone_name, self.base_name, self.location, self.time) - all_fields.append(field) - return np.array(all_fields) - - - def set_reduced_fields(self, dataset, fn, reduced_fields): - for i in range(len(dataset)): - for j in range(self.n_components[fn]): - dataset[i].add_scalar(f"reduced_{fn}_{j}", reduced_fields[i, j]) - - def get_reduced_fields(self, dataset, fn): - return dataset.get_scalars_to_tabular( - scalar_names = [f"reduced_{fn}_{j}" for j in range(self.n_components[fn])], - as_nparray = True - ) - - def set_fields(self, dataset, fn, fields): - for i in range(len(dataset)): - dataset[i].add_field(fn, fields[i], self.zone_name, self.base_name, self.location, self.time) - - def _fit(self, dataset, y=None): - for fn in self.field_names: - all_fields = self.get_all_fields(dataset, 
fn) - self.model[fn].fit(all_fields) - - def _transform(self, dataset): - for fn in self.field_names: - all_fields = self.get_all_fields(dataset, fn) - reduced_fields = self.model[fn].transform(all_fields) - self.set_reduced_fields(dataset, fn, reduced_fields) - return dataset - - def _inverse_transform(self, dataset): - for fn in self.field_names: - reduced_fields = self.get_reduced_fields(dataset, fn) - fields = self.model[fn].inverse_transform(reduced_fields) - self.set_fields(dataset, fn, fields) - return dataset - - -# from Muscat.Containers import MeshGraphTools as MGT -# from Muscat.Bridges.CGNSBridge import CGNSToMesh, MeshToCGNS -# from Muscat.Containers import MeshModificationTools as MMT - -# import sys -# from contextlib import contextmanager - -# @contextmanager -# def suppress_stdout(): -# original_stdout = sys.stdout -# sys.stdout = open(os.devnull, 'w') -# try: -# yield -# finally: -# sys.stdout.close() -# sys.stdout = original_stdout - -# class TutteMorphing(PersistentNode): -# def __init__(self, name, params): -# super().__init__(params['save_path'], name) -# self.prob_def = params['prob_def'] -# self.loc_params = params[name] -# self.model = {} - -# def fit(self, X, y=None): -# # No fitting needed here -# return self - -# def transform(self, dataset): -# return Parallel(n_jobs=self.loc_params['n_jobs'])( -# delayed(self._process_row)(sample) for sample in dataset -# ) - -# # def inverse_transform(self, dataset): -# # return Parallel(n_jobs=self.n_jobs)( -# # delayed(self._process_row)(sample) for sample in dataset -# # ) - - -# def _process_row(self, sample): -# # Your custom transformation logic - -# mesh = CGNSToMesh(sample.get_mesh()) - -# with suppress_stdout(): -# mesh_renumb, renumbering, n_boundary = MGT.RenumberMeshForParametrization( -# mesh, inPlace=False) -# mesh_renumb.elemFields = mesh_renumb.nodeFields = {} -# morphed_mesh, _ = MGT.FloaterMeshParametrization( -# mesh_renumb, n_boundary) - -# # ---# Check invariance -# assert (np.all(renumbering == np.argsort(np.argsort(renumbering)))) -# MMT.NodesPermutation(morphed_mesh, np.argsort(renumbering)) - -# sample.del_tree(time = 0.) 
-# sample.add_tree(MeshToCGNS(morphed_mesh)) - -# return sample - - - - - -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel -from sklearn.multioutput import MultiOutputRegressor - -available_kernel_classes = { - "Matern":Matern -} - -class GPRegressorNode(PersistentNode): - - def __init__(self, name, global_params, params): - super().__init__(name, global_params, params) - - self.loc_params = params - assert self.loc_params['type'] == "GaussianProcessRegressor" - - options = self.loc_params['options'] - assert options["kernel"] in available_kernel_classes.keys(), "scikit-learn kernel "+self.options["kernel"]+" not available" - kernel_class = available_kernel_classes[options["kernel"]] - - self.input_names = self.loc_params['input']['names'] - self.output_names = self.loc_params['output']['names'] - - if options["anisotropic"]: - kernel = ConstantKernel() * kernel_class(length_scale=np.ones(len(self.input_names)), length_scale_bounds=(1e-8, 1e8), - **options["kernel_options"]) + WhiteKernel(noise_level_bounds=(1e-8, 1e8)) - else: - kernel = kernel_class(length_scale_bounds=(1e-8, 1e8), **options["kernel_options"]) \ - + WhiteKernel(noise_level_bounds=(1e-8, 1e8)) - - gpr = GaussianProcessRegressor( - kernel=kernel, - optimizer=options["optim"], - n_restarts_optimizer=options["num_restarts"], - random_state = options["random_state"]) - - self.model = MultiOutputRegressor(gpr) - - - def get_scalars(self, dataset): - return dataset.get_scalars_to_tabular( - scalar_names = self.input_names, - as_nparray = True - ) - - def _fit(self, dataset, y=None): - X = dataset.get_scalars_to_tabular( - scalar_names = self.input_names, - as_nparray = True - ) - y = dataset.get_scalars_to_tabular( - scalar_names = self.output_names, - as_nparray = True - ) - self.model.fit(X, y) - - def _predict(self, dataset): - X = dataset.get_scalars_to_tabular( - scalar_names = self.input_names, - as_nparray = True - ) - - pred= self.model.predict(X) - if len(self.output_names) == 1: - pred = pred.reshape((-1, 1)) - - for i in range(len(dataset)): - for j, sn in enumerate(self.output_names): - dataset[i].add_scalar(sn, pred[i, j]) - - return dataset - - def _transform(self, dataset): - return dataset - - def _inverse_transform(self, dataset): - return dataset - - def _score(self, dataset, dataset_ref): - X = dataset.get_scalars_to_tabular( - scalar_names = self.input_names, - as_nparray = True - ) - y = dataset_ref.get_scalars_to_tabular( - scalar_names = self.output_names, - as_nparray = True - ) - return self.model.score(X, y) - - -class ScalerNode(PersistentNode): - def __init__(self, name, save_path): - super().__init__(save_path, name) - - self.model = StandardScaler() - - def _fit(self, X, y=None): - self.model.fit(X) - - def _transform(self, X): - return self.model.transform(X) - - def _inverse_transform(self, X): - return self.model.inverse_transform(X) - - def _predict(self, X): - raise AttributeError("ScalarScalerNode does not support predict.") - - -class RegressorNode(PersistentNode): - def __init__(self, name, save_path, alpha=1.0): - super().__init__(save_path, name) - self.model = Ridge(alpha=alpha) - - def _fit(self, X, y): - self.model.fit(X, y) - - def _predict(self, X): - return self.model.predict(X) - - def _transform(self, X): - raise AttributeError("RegressorNode does not support transform.") - - def _inverse_transform(self, X): - raise AttributeError("RegressorNode does not support inverse_transform.") 
diff --git a/examples/pipelines/with_persistence/sklearn_pipeline.py b/examples/pipelines/with_persistence/sklearn_pipeline.py deleted file mode 100644 index bdefffe..0000000 --- a/examples/pipelines/with_persistence/sklearn_pipeline.py +++ /dev/null @@ -1,92 +0,0 @@ -from datasets import load_dataset -from plaid.containers.sample import Sample -from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition -import os, pickle -import numpy as np -from safetensors.numpy import save_file -from sklearn.base import BaseEstimator, RegressorMixin - -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler - -from pathlib import Path -import joblib - -import yaml -import time - -from ml_pipeline_nodes import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode #, TutteMorphing - - -with open("config_2.yml") as f: - config = yaml.safe_load(f) - -global_params = config["global"] - - -start = time.time() -hf_dataset = load_dataset(global_params['dataset_path'], split="all_samples") -print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds") - -prob_def = huggingface_description_to_problem_definition(hf_dataset.description) - -ref_split = prob_def.get_split(global_params['train_split_name'])[:20] -train_split = prob_def.get_split(global_params['train_split_name'])[10:20] -test_split = prob_def.get_split(global_params['train_split_name'])[:20] - -dataset_ref, _ = huggingface_dataset_to_plaid(hf_dataset, ids = ref_split) -dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split) -dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split) - - - - -pipeline = Pipeline([ - ('input_scalar_scaler', ScalarScalerNode(name = 'input_scalar_scaler', global_params = global_params, params = config['input_scalar_scaler'])), - ('output_scalar_scaler', ScalarScalerNode(name = 'output_scalar_scaler', global_params = global_params, params = config['output_scalar_scaler'])), - ('pca_shape_embedding', PCAEmbeddingNode(name = 'pca_shape_embedding', global_params = global_params, params = config['pca_shape_embedding'])), - ('pca_field_embedding', PCAEmbeddingNode(name = 'pca_field_embedding', global_params = global_params, params = config['pca_field_embedding'])), - ('tabular_regressor', GPRegressorNode(name = 'tabular_regressor', global_params = global_params, params = config['tabular_regressor'])) -]) - -print("pipeline parameters=", pipeline.get_params(deep=True)) - -1./0. - -ind = train_split[0] - -pipeline.fit(dataset_train) -print("pipeline fitted") - -dataset_ref = pipeline.transform(dataset_ref) - -dataset_test_2 = pipeline.inverse_transform(pipeline.predict(dataset_ref)) - -print("score =", pipeline.score(dataset_test_2, dataset_ref)) - -print(dataset_test_2) - -dataset_ref._save_to_dir_(os.path.join(params['save_path'], "dataset_ref"), verbose = True) -dataset_test_2._save_to_dir_(os.path.join(params['save_path'], "dataset_test_2"), verbose = True) - - -# print(dataset_train[ind]) -# print(dataset_train[ind].get_scalar_names()) - -1./0. 
-dataset_2 = pipeline.inverse_transform(pipeline.predict(dataset)) - -print(dataset_2[0].get_scalar('max_U2_top')) - -# dataset_3 = pipeline.inverse_transform(dataset_2) - -# print(dataset_3[0].get_scalar('p1')) - - - -# dataset_4 = pipeline.inverse_transform(pipeline.fit_transform(dataset)) -# print(dataset_4[0].get_scalar('p1')) - - -test_split = prob_def.get_split(params['test_split_name']) -dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split) From 16c726831d983ce9d0ba4949a7a9d40f2db40d66 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Mon, 30 Jun 2025 18:36:18 +0200 Subject: [PATCH 12/19] fix(huggingface_bridge) coverage: ignore line not reported with multiprocessing --- src/plaid/bridges/huggingface_bridge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plaid/bridges/huggingface_bridge.py b/src/plaid/bridges/huggingface_bridge.py index 4a46ae2..a767629 100644 --- a/src/plaid/bridges/huggingface_bridge.py +++ b/src/plaid/bridges/huggingface_bridge.py @@ -173,7 +173,7 @@ class HFToPlaidSampleConverter: def __init__(self, ds): self.ds = ds - def __call__(self, ind): + def __call__(self, ind): # pragma: no cover (not reported with multiprocessing) """Convert a single sample from the huggingface dataset to a plaid sample.""" return Sample.model_validate(pickle.loads(self.ds[ind]["sample"])) @@ -184,7 +184,7 @@ class HFShardToPlaidSampleConverter: def __init__(self, shard_path): self.ds = load_from_disk(shard_path) - def __call__(self, idx): + def __call__(self, idx): # pragma: no cover (not reported with multiprocessing) """Convert a sample shard from the huggingface dataset to a plaid sample.""" sample = self.ds[idx] return Sample.model_validate(pickle.loads(sample["sample"])) From 9f234880e90e83d248c00b61015b29896d6b4200 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Mon, 30 Jun 2025 18:41:22 +0200 Subject: [PATCH 13/19] fix(examples/pipelines) remove comment --- examples/pipelines/sklearn_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipelines/sklearn_pipeline.py b/examples/pipelines/sklearn_pipeline.py index 8fbcb04..8a6790c 100644 --- a/examples/pipelines/sklearn_pipeline.py +++ b/examples/pipelines/sklearn_pipeline.py @@ -17,7 +17,7 @@ import yaml import time -from ml_pipeline_nodes import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode #, TutteMorphing +from ml_pipeline_nodes import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode import warnings warnings.filterwarnings('ignore', module='sklearn') From 42f1b0e197167c9d9b8a49f0f3a0e36fd7387fe9 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Mon, 30 Jun 2025 19:39:51 +0200 Subject: [PATCH 14/19] fix(examples/pipelines) remove unused imports --- examples/pipelines/ml_pipeline_nodes.py | 13 +++++++------ examples/pipelines/sklearn_pipeline.py | 25 +++++++++++-------------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/examples/pipelines/ml_pipeline_nodes.py b/examples/pipelines/ml_pipeline_nodes.py index 6f0aa37..43b1b54 100644 --- a/examples/pipelines/ml_pipeline_nodes.py +++ b/examples/pipelines/ml_pipeline_nodes.py @@ -1,11 +1,12 @@ -import os -import json -import joblib -from joblib import Parallel, delayed +# -*- coding: utf-8 -*- +# +# This file is subject to the terms and conditions defined in +# file 'LICENSE.txt', which is part of this source code package. 
+# +# + import numpy as np -from pathlib import Path from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin -from sklearn.linear_model import Ridge from sklearn.decomposition import PCA from plaid.containers.dataset import Dataset import copy diff --git a/examples/pipelines/sklearn_pipeline.py b/examples/pipelines/sklearn_pipeline.py index 8a6790c..8c9594b 100644 --- a/examples/pipelines/sklearn_pipeline.py +++ b/examples/pipelines/sklearn_pipeline.py @@ -1,21 +1,18 @@ +# -*- coding: utf-8 -*- +# +# This file is subject to the terms and conditions defined in +# file 'LICENSE.txt', which is part of this source code package. +# +# + +import os +import yaml +import time + from datasets import load_dataset -from plaid.containers.sample import Sample from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition -import os, pickle -import numpy as np -from safetensors.numpy import save_file -from sklearn.base import BaseEstimator, RegressorMixin from sklearn.model_selection import GridSearchCV - from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler - -from pathlib import Path -import joblib -from copy import copy - -import yaml -import time from ml_pipeline_nodes import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode From 53f00cb1341b7bb19ba8fb565ac6ebf030c3082b Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Tue, 1 Jul 2025 07:21:00 +0200 Subject: [PATCH 15/19] fix(pipeline) configure parallel conversion in example and remove time arg in PCAEmbeddingNode --- examples/pipelines/ml_pipeline_nodes.py | 24 ++++++++++++------------ examples/pipelines/sklearn_pipeline.py | 8 ++++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/examples/pipelines/ml_pipeline_nodes.py b/examples/pipelines/ml_pipeline_nodes.py index 43b1b54..2bd3289 100644 --- a/examples/pipelines/ml_pipeline_nodes.py +++ b/examples/pipelines/ml_pipeline_nodes.py @@ -67,26 +67,25 @@ def inverse_transform(self, dataset): class PCAEmbeddingNode(BaseEstimator, RegressorMixin, TransformerMixin): - def __init__(self, field_name = None, n_components = None, zone_name = None, base_name = None, time = None, location = "Vertex"): + def __init__(self, field_name = None, n_components = None, zone_name = None, base_name = None, location = "Vertex"): + self.field_name = field_name + self.n_components = n_components self.zone_name = zone_name self.base_name = base_name - self.time = time self.location = location - self.field_name = field_name - self.n_components = n_components - self.model = None def get_all_fields(self, dataset): all_fields = [] for sample in dataset: - if self.field_name == "nodes": - field = sample.get_nodes(self.zone_name, self.base_name, self.time).flatten() - else: - field = sample.get_field(self.field_name, self.zone_name, self.base_name, self.location, self.time) - all_fields.append(field) + for time in sample.get_all_mesh_times(): + if self.field_name == "nodes": + field = sample.get_nodes(self.zone_name, self.base_name, time).flatten() + else: + field = sample.get_field(self.field_name, self.zone_name, self.base_name, self.location, time) + all_fields.append(field) return np.array(all_fields) def set_reduced_fields(self, dataset, reduced_fields): @@ -102,9 +101,10 @@ def get_reduced_fields(self, dataset): as_nparray = True ) - def set_fields(self, dataset, fields): + def set_fields(self, dataset, fields): # TODO: this will not work with multiple times step per sample for i in 
range(len(dataset)): - dataset[i].add_field(self.field_name, fields[i], self.zone_name, self.base_name, self.location, self.time) + for time in dataset[i].get_all_mesh_times(): + dataset[i].add_field(self.field_name, fields[i], self.zone_name, self.base_name, self.location, time) def fit(self, dataset, y=None): self.model = PCA(n_components = self.n_components) diff --git a/examples/pipelines/sklearn_pipeline.py b/examples/pipelines/sklearn_pipeline.py index 8c9594b..128c0fc 100644 --- a/examples/pipelines/sklearn_pipeline.py +++ b/examples/pipelines/sklearn_pipeline.py @@ -33,7 +33,7 @@ prob_def = huggingface_description_to_problem_definition(hf_dataset.description) train_split = prob_def.get_split(global_params['train_split_name'])[:100] -dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split) +dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split, processes_number = os.cpu_count()) pipeline = Pipeline([ @@ -72,10 +72,10 @@ print("Direct pipeline example:") train_split = prob_def.get_split(global_params['train_split_name']) -test_split = prob_def.get_split(global_params['train_split_name'])[:10] +test_split = prob_def.get_split(global_params['train_split_name'])[:24] -dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split) -dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split) +dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split, processes_number = os.cpu_count()) +dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split, processes_number = os.cpu_count()) From df0cb1f13daa01b77d3c879e5dba8ff858b2e96f Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Tue, 1 Jul 2025 07:52:47 +0200 Subject: [PATCH 16/19] feat(examples/pipelines) simplify arguments in pipeline nodes, by giving a global dict and specifying only arguments to be optimized by GridSearchCV (n_components of PCA for the moment) --- examples/pipelines/ml_pipeline_nodes.py | 70 +++++++++++++++---------- examples/pipelines/sklearn_pipeline.py | 8 +-- 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/examples/pipelines/ml_pipeline_nodes.py b/examples/pipelines/ml_pipeline_nodes.py index 2bd3289..896bc69 100644 --- a/examples/pipelines/ml_pipeline_nodes.py +++ b/examples/pipelines/ml_pipeline_nodes.py @@ -20,14 +20,16 @@ class ScalarScalerNode(BaseEstimator, TransformerMixin): - def __init__(self, type, scalar_names): - self.type = type - self.scalar_names = scalar_names + def __init__(self, params): - assert type in available_scalers.keys(), "Scaler "+type+" not available" + self.params = params - self.model = None + self.type_ = params['type'] + self.scalar_names = params['scalar_names'] + assert self.type_ in available_scalers.keys(), "Scaler "+self.type_+" not available" + + self.model = None def get_scalars(self, dataset): if isinstance(dataset, list): @@ -43,7 +45,7 @@ def set_scalars(self, dataset, scalars): dataset[i].add_scalar(sn, scalars[i, j]) def fit(self, dataset, y=None): - self.model = available_scalers[self.type]() + self.model = available_scalers[self.type_]() scalars = self.get_scalars(dataset) self.model.fit(scalars) @@ -67,13 +69,16 @@ def inverse_transform(self, dataset): class PCAEmbeddingNode(BaseEstimator, RegressorMixin, TransformerMixin): - def __init__(self, field_name = None, n_components = None, zone_name = None, base_name = None, location = "Vertex"): + def __init__(self, params, n_components = None): - self.field_name = field_name - self.n_components 
= n_components - self.zone_name = zone_name - self.base_name = base_name - self.location = location + self.params = params + + self.n_components = n_components if n_components is not None else params['n_components'] + + self.field_name = params['field_name'] + self.zone_name = params['zone_name'] if 'zone_name' in params else None + self.base_name = params['base_name'] if 'base_name' in params else None + self.location = params['location'] if 'location' in params else "Vertex" self.model = None @@ -142,8 +147,15 @@ class GPRegressorNode(BaseEstimator, RegressorMixin, TransformerMixin): def __init__(self, params): self.params = params - assert params['type'] == "GaussianProcessRegressor" - assert self.params['options']["kernel"] in available_kernel_classes.keys(), "scikit-learn kernel "+self.params['options']["kernel"]+" not available" + + self.type_ = params['type'] + self.input = params['input'] + self.output = params['output'] + self.options = params['options'] + + + assert self.type_ == "GaussianProcessRegressor" + assert self.options['kernel'] in available_kernel_classes.keys(), "scikit-learn kernel "+self.options['kernel']+" not available" self.model = None @@ -162,32 +174,32 @@ def fit(self, dataset, y=None): all_available_scalar = dataset.get_scalar_names() self.input_names = [] - if "scalar_names" in self.params['input']: - self.input_names += self.params['input']["scalar_names"] - if "vector_names" in self.params['input']: - for vn in self.params['input']["vector_names"]: + if "scalar_names" in self.input: + self.input_names += self.input["scalar_names"] + if "vector_names" in self.input: + for vn in self.input["vector_names"]: self.input_names += [s for s in all_available_scalar if s.startswith(vn)] self.output_names = [] - if "scalar_names" in self.params['output']: - self.output_names += self.params['output']["scalar_names"] - if "vector_names" in self.params['output']: - for vn in self.params['output']["vector_names"]: + if "scalar_names" in self.output: + self.output_names += self.output["scalar_names"] + if "vector_names" in self.output: + for vn in self.output["vector_names"]: self.output_names += [s for s in all_available_scalar if s.startswith(vn)] - kernel_class = available_kernel_classes[self.params['options']["kernel"]] - if self.params['options']["anisotropic"]: + kernel_class = available_kernel_classes[self.options['kernel']] + if self.options["anisotropic"]: kernel = ConstantKernel() * kernel_class(length_scale=np.ones(len(self.input_names)), length_scale_bounds=(1e-8, 1e8), - **self.params['options']["kernel_options"]) + WhiteKernel(noise_level_bounds=(1e-8, 1e8)) + **self.options["kernel_options"]) + WhiteKernel(noise_level_bounds=(1e-8, 1e8)) else: - kernel = kernel_class(length_scale_bounds=(1e-8, 1e8), **self.params['options']["kernel_options"]) \ + kernel = kernel_class(length_scale_bounds=(1e-8, 1e8), **self.options["kernel_options"]) \ + WhiteKernel(noise_level_bounds=(1e-8, 1e8)) gpr = GaussianProcessRegressor( kernel=kernel, - optimizer=self.params['options']["optim"], - n_restarts_optimizer=self.params['options']["num_restarts"], - random_state = self.params['options']["random_state"]) + optimizer=self.options["optim"], + n_restarts_optimizer=self.options["num_restarts"], + random_state = self.options["random_state"]) self.model = MultiOutputRegressor(gpr) if isinstance(dataset, list): diff --git a/examples/pipelines/sklearn_pipeline.py b/examples/pipelines/sklearn_pipeline.py index 128c0fc..19d8575 100644 --- a/examples/pipelines/sklearn_pipeline.py +++ 
b/examples/pipelines/sklearn_pipeline.py @@ -37,10 +37,10 @@ pipeline = Pipeline([ - ('input_scalar_scaler', ScalarScalerNode(type = config['input_scalar_scaler']['type'], scalar_names = config['input_scalar_scaler']['scalar_names'])), - ('output_scalar_scaler', ScalarScalerNode(type = config['output_scalar_scaler']['type'], scalar_names = config['output_scalar_scaler']['scalar_names'])), - ('pca_nodes', PCAEmbeddingNode(field_name = config['pca_nodes']['field_name'], n_components = config['pca_nodes']['n_components'], base_name = config['pca_nodes']['base_name'])), - ('pca_mach', PCAEmbeddingNode(field_name = config['pca_mach']['field_name'], n_components = config['pca_mach']['n_components'], base_name = config['pca_mach']['base_name'])), + ('input_scalar_scaler', ScalarScalerNode(params = config['input_scalar_scaler'])), + ('output_scalar_scaler', ScalarScalerNode(params = config['output_scalar_scaler'])), + ('pca_nodes', PCAEmbeddingNode(params = config['pca_nodes'], n_components = config['pca_nodes']['n_components'])), + ('pca_mach', PCAEmbeddingNode(params = config['pca_mach'], n_components = config['pca_mach']['n_components'])), ('regressor_mach', GPRegressorNode(params = config['regressor_mach'])) ]) From f0e081c43e515962c11edb23a6ba869e026c0af3 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Tue, 1 Jul 2025 07:56:37 +0200 Subject: [PATCH 17/19] feat(examples/pipelines) add other pipeline definition with all arguments specified in config.yml --- examples/pipelines/sklearn_pipeline.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/pipelines/sklearn_pipeline.py b/examples/pipelines/sklearn_pipeline.py index 19d8575..540e937 100644 --- a/examples/pipelines/sklearn_pipeline.py +++ b/examples/pipelines/sklearn_pipeline.py @@ -35,7 +35,7 @@ train_split = prob_def.get_split(global_params['train_split_name'])[:100] dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split, processes_number = os.cpu_count()) - +# Pipeline with ``n_components`` specified in PCAEmbeddingNode for GridSearchCV pipeline = Pipeline([ ('input_scalar_scaler', ScalarScalerNode(params = config['input_scalar_scaler'])), ('output_scalar_scaler', ScalarScalerNode(params = config['output_scalar_scaler'])), @@ -77,6 +77,14 @@ dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split, processes_number = os.cpu_count()) dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split, processes_number = os.cpu_count()) +# Pipeline with all arguments specified in ``config.yml`` +pipeline = Pipeline([ + ('input_scalar_scaler', ScalarScalerNode(params = config['input_scalar_scaler'])), + ('output_scalar_scaler', ScalarScalerNode(params = config['output_scalar_scaler'])), + ('pca_nodes', PCAEmbeddingNode(params = config['pca_nodes'])), + ('pca_mach', PCAEmbeddingNode(params = config['pca_mach'])), + ('regressor_mach', GPRegressorNode(params = config['regressor_mach'])) +]) pipeline.fit(dataset_train) From e5051758720bdeae48cb20f8eee9143172297b4f Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Thu, 3 Jul 2025 20:48:29 +0200 Subject: [PATCH 18/19] fix(huggingface_bridge, dataset) typing improvement, print remove --- examples/pipelines/ml_pipeline_nodes.py | 6 +++--- src/plaid/bridges/huggingface_bridge.py | 6 +++--- src/plaid/containers/dataset.py | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/pipelines/ml_pipeline_nodes.py b/examples/pipelines/ml_pipeline_nodes.py index 896bc69..f66e40e 100644 --- 
a/examples/pipelines/ml_pipeline_nodes.py +++ b/examples/pipelines/ml_pipeline_nodes.py @@ -76,9 +76,9 @@ def __init__(self, params, n_components = None): self.n_components = n_components if n_components is not None else params['n_components'] self.field_name = params['field_name'] - self.zone_name = params['zone_name'] if 'zone_name' in params else None - self.base_name = params['base_name'] if 'base_name' in params else None - self.location = params['location'] if 'location' in params else "Vertex" + self.zone_name = params.get('zone_name') + self.base_name = params.get('base_name') + self.location = params.get('location', 'Vertex') self.model = None diff --git a/src/plaid/bridges/huggingface_bridge.py b/src/plaid/bridges/huggingface_bridge.py index a767629..28dc5f6 100644 --- a/src/plaid/bridges/huggingface_bridge.py +++ b/src/plaid/bridges/huggingface_bridge.py @@ -5,7 +5,7 @@ import shutil import sys from multiprocessing import Pool -from typing import Callable +from typing import Callable, Optional from datasets import load_from_disk from tqdm import tqdm @@ -192,9 +192,9 @@ def __call__(self, idx): # pragma: no cover (not reported with multiprocessing) def huggingface_dataset_to_plaid( ds: datasets.Dataset, - ids: list[int] = None, + ids: Optional[list[int]] = None, processes_number: int = 1, - large_dataset=False, + large_dataset: Optional[bool] = False, ) -> tuple[Self, ProblemDefinition]: """Use this function for converting a plaid dataset from a huggingface dataset. diff --git a/src/plaid/containers/dataset.py b/src/plaid/containers/dataset.py index d1abc1f..045056f 100644 --- a/src/plaid/containers/dataset.py +++ b/src/plaid/containers/dataset.py @@ -1063,8 +1063,7 @@ def __getitem__( Seealso: This function can also be called using `__call__()`. """ - if isinstance(id, slice) or isinstance(id, list) or isinstance(id, np.ndarray): - print(">>>", type(id)) + if isinstance(id, (slice, list, np.ndarray)): if isinstance(id, slice): id = list(range(*id.indices(len(self)))) dataset = Dataset() From 7481b065899240860ae9fd75a9243117a0adf293 Mon Sep 17 00:00:00 2001 From: Fabien Casenave Date: Thu, 17 Jul 2025 09:23:25 +0200 Subject: [PATCH 19/19] temp --- examples/pipelines/ml_pipeline_nodes.py | 18 + examples/pipelines/sklearn_pipeline.ipynb | 6109 +++++++++++++++++++++ 2 files changed, 6127 insertions(+) create mode 100644 examples/pipelines/sklearn_pipeline.ipynb diff --git a/examples/pipelines/ml_pipeline_nodes.py b/examples/pipelines/ml_pipeline_nodes.py index f66e40e..d099fd1 100644 --- a/examples/pipelines/ml_pipeline_nodes.py +++ b/examples/pipelines/ml_pipeline_nodes.py @@ -260,3 +260,21 @@ def score(self, dataset, dataset_ref): ) return self.model.score(X, y) + +class DatasetTargetTransformerRegressor(BaseEstimator, RegressorMixin): + def __init__(self, regressor, transformer): + self.regressor = regressor + self.transformer = transformer + + def fit(self, dataset, y=None): + # Apply the transformation to the dataset (modifies y in the samples) + transformed_dataset = self.transformer.fit_transform(dataset) + + # Train the regressor on the transformed dataset + self.regressor.fit(transformed_dataset) + return self + + def predict(self, dataset): + # Prediction in the transformed space (e.g. log(y))
+ dataset_pred_transformed = self.regressor.predict(dataset) + return self.transformer.inverse_transform(dataset_pred_transformed) diff --git a/examples/pipelines/sklearn_pipeline.ipynb b/examples/pipelines/sklearn_pipeline.ipynb new file mode 100644 index 0000000..0ee437f --- /dev/null +++ b/examples/pipelines/sklearn_pipeline.ipynb @@ -0,0 +1,6109 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/ssa/users/d582428/envs/plaid-devenv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "\n", + "import os\n", + "import yaml\n", + "import time\n", + "\n", + "from datasets import load_dataset, load_from_disk\n", + "from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer, TransformedTargetRegressor\n", + "\n", + "from ml_pipeline_nodes import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode, DatasetTargetTransformerRegressor\n", + "from sklearn.utils import estimator_html_repr\n", + "\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore', module='sklearn')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading dataset from HuggingFace Hub took: 0.055 seconds\n", + "Converting huggingface dataset to plaid dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 100/100 [00:00<00:00, 478.59it/s]\n" + ] + } + ], + "source": [ + "\n", + "with open(\"config.yml\") as f:\n", + " config = yaml.safe_load(f)\n", + "\n", + "global_params = config[\"global\"]\n", + "\n", + "\n", + "start = time.time()\n", + "# hf_dataset = load_dataset(global_params['dataset_path'], split=\"all_samples\")\n", + "hf_dataset = load_from_disk(global_params['dataset_path'])\n", + "print(f\"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds\")\n", + "\n", + "prob_def = huggingface_description_to_problem_definition(hf_dataset.description)\n", + "\n", + "train_split = prob_def.get_split(global_params['train_split_name'])[:100]\n", + "dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split, processes_number = 24)#os.cpu_count())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
ColumnTransformer(remainder='passthrough',\n",
+       "                  transformers=[('pca', PCA(n_components=8),\n",
+       "                                 [0, 1, 2, 3, 4, 5, 6, 7])])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "ColumnTransformer(remainder='passthrough',\n", + " transformers=[('pca', PCA(n_components=8),\n", + " [0, 1, 2, 3, 4, 5, 6, 7])])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.decomposition import PCA\n", + "feats_to_reduce = list(range(8))\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " (\n", + " \"pca\",\n", + " PCA(n_components=8),\n", + " feats_to_reduce,\n", + " ),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "preprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('input_scalar_scaler',\n",
+       "                 ScalarScalerNode(params={'scalar_names': ['angle_in',\n",
+       "                                                           'mach_out'],\n",
+       "                                          'type': 'MinMaxScaler'})),\n",
+       "                ('pca_nodes',\n",
+       "                 PCAEmbeddingNode(n_components=3,\n",
+       "                                  params={'base_name': 'Base_2_2',\n",
+       "                                          'field_name': 'nodes',\n",
+       "                                          'n_components': 3}))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('input_scalar_scaler',\n", + " ScalarScalerNode(params={'scalar_names': ['angle_in',\n", + " 'mach_out'],\n", + " 'type': 'MinMaxScaler'})),\n", + " ('pca_nodes',\n", + " PCAEmbeddingNode(n_components=3,\n", + " params={'base_name': 'Base_2_2',\n", + " 'field_name': 'nodes',\n", + " 'n_components': 3}))])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessor = Pipeline([\n", + " ('input_scalar_scaler', ScalarScalerNode(params = config['input_scalar_scaler'])),\n", + " ('pca_nodes', PCAEmbeddingNode(params = config['pca_nodes'], n_components = config['pca_nodes']['n_components'])),\n", + "])\n", + "preprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('output_scalar_scaler',\n",
+       "                 ScalarScalerNode(params={'scalar_names': ['Q', 'power', 'Pr',\n",
+       "                                                           'Tr', 'eth_is',\n",
+       "                                                           'angle_out'],\n",
+       "                                          'type': 'MinMaxScaler'})),\n",
+       "                ('pca_mach',\n",
+       "                 PCAEmbeddingNode(n_components=5,\n",
+       "                                  params={'base_name': 'Base_2_2',\n",
+       "                                          'field_name': 'mach',\n",
+       "                                          'n_components': 5}))])
" + ], + "text/plain": [ + "Pipeline(steps=[('output_scalar_scaler',\n", + " ScalarScalerNode(params={'scalar_names': ['Q', 'power', 'Pr',\n", + " 'Tr', 'eth_is',\n", + " 'angle_out'],\n", + " 'type': 'MinMaxScaler'})),\n", + " ('pca_mach',\n", + " PCAEmbeddingNode(n_components=5,\n", + " params={'base_name': 'Base_2_2',\n", + " 'field_name': 'mach',\n", + " 'n_components': 5}))])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "postprocessor = Pipeline(\n", + " [\n", + " ('output_scalar_scaler', ScalarScalerNode(params = config['output_scalar_scaler'])),\n", + " ('pca_mach', PCAEmbeddingNode(params = config['pca_mach'], n_components = config['pca_mach']['n_components'])),\n", + " ]\n", + ")\n", + "postprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
DatasetTargetTransformerRegressor(regressor=GPRegressorNode(params={'input': {'scalar_names': ['angle_in',\n",
+       "                                                                                               'mach_out'],\n",
+       "                                                                              'vector_names': ['reduced_nodes']},\n",
+       "                                                                    'options': {'anisotropic': True,\n",
+       "                                                                                'kernel': 'Matern',\n",
+       "                                                                                'kernel_options': {'nu': 2.5},\n",
+       "                                                                                'num_restarts': 2,\n",
+       "                                                                                'optim': 'fmin_l_bfgs_b',\n",
+       "                                                                                'random_state': 42,\n",
+       "                                                                                'show_warnings': False},\n",
+       "                                                                    'output': {'vector_names': ['reduced_mach']},\n",
+       "                                                                    'type': 'GaussianProcessRegressor'}),\n",
+       "                                  transformer=Pipeline(steps=[('output_scalar_scaler',\n",
+       "                                                               ScalarScalerNode(params={'scalar_names': ['Q',\n",
+       "                                                                                                         'power',\n",
+       "                                                                                                         'Pr',\n",
+       "                                                                                                         'Tr',\n",
+       "                                                                                                         'eth_is',\n",
+       "                                                                                                         'angle_out'],\n",
+       "                                                                                        'type': 'MinMaxScaler'})),\n",
+       "                                                              ('pca_mach',\n",
+       "                                                               PCAEmbeddingNode(n_components=5,\n",
+       "                                                                                params={'base_name': 'Base_2_2',\n",
+       "                                                                                        'field_name': 'mach',\n",
+       "                                                                                        'n_components': 5}))]))
" + ], + "text/plain": [ + "DatasetTargetTransformerRegressor(regressor=GPRegressorNode(params={'input': {'scalar_names': ['angle_in',\n", + " 'mach_out'],\n", + " 'vector_names': ['reduced_nodes']},\n", + " 'options': {'anisotropic': True,\n", + " 'kernel': 'Matern',\n", + " 'kernel_options': {'nu': 2.5},\n", + " 'num_restarts': 2,\n", + " 'optim': 'fmin_l_bfgs_b',\n", + " 'random_state': 42,\n", + " 'show_warnings': False},\n", + " 'output': {'vector_names': ['reduced_mach']},\n", + " 'type': 'GaussianProcessRegressor'}),\n", + " transformer=Pipeline(steps=[('output_scalar_scaler',\n", + " ScalarScalerNode(params={'scalar_names': ['Q',\n", + " 'power',\n", + " 'Pr',\n", + " 'Tr',\n", + " 'eth_is',\n", + " 'angle_out'],\n", + " 'type': 'MinMaxScaler'})),\n", + " ('pca_mach',\n", + " PCAEmbeddingNode(n_components=5,\n", + " params={'base_name': 'Base_2_2',\n", + " 'field_name': 'mach',\n", + " 'n_components': 5}))]))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regressor = DatasetTargetTransformerRegressor(\n", + " regressor=GPRegressorNode(params = config['regressor_mach']),\n", + " transformer=postprocessor,\n", + ")\n", + "regressor" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocessor',\n",
+       "                 Pipeline(steps=[('input_scalar_scaler',\n",
+       "                                  ScalarScalerNode(params={'scalar_names': ['angle_in',\n",
+       "                                                                            'mach_out'],\n",
+       "                                                           'type': 'MinMaxScaler'})),\n",
+       "                                 ('pca_nodes',\n",
+       "                                  PCAEmbeddingNode(n_components=3,\n",
+       "                                                   params={'base_name': 'Base_2_2',\n",
+       "                                                           'field_name': 'nodes',\n",
+       "                                                           'n_components': 3}))])),\n",
+       "                ('regressor',\n",
+       "                 DatasetTargetTransformerRegressor(regressor=GPRegressorNo...\n",
+       "                                                                                     'output': {'vector_names': ['reduced_mach']},\n",
+       "                                                                                     'type': 'GaussianProcessRegressor'}),\n",
+       "                                                   transformer=Pipeline(steps=[('output_scalar_scaler',\n",
+       "                                                                                ScalarScalerNode(params={'scalar_names': ['Q',\n",
+       "                                                                                                                          'power',\n",
+       "                                                                                                                          'Pr',\n",
+       "                                                                                                                          'Tr',\n",
+       "                                                                                                                          'eth_is',\n",
+       "                                                                                                                          'angle_out'],\n",
+       "                                                                                                         'type': 'MinMaxScaler'})),\n",
+       "                                                                               ('pca_mach',\n",
+       "                                                                                PCAEmbeddingNode(n_components=5,\n",
+       "                                                                                                 params={'base_name': 'Base_2_2',\n",
+       "                                                                                                         'field_name': 'mach',\n",
+       "                                                                                                         'n_components': 5}))])))])
" + ], + "text/plain": [ + "Pipeline(steps=[('preprocessor',\n", + " Pipeline(steps=[('input_scalar_scaler',\n", + " ScalarScalerNode(params={'scalar_names': ['angle_in',\n", + " 'mach_out'],\n", + " 'type': 'MinMaxScaler'})),\n", + " ('pca_nodes',\n", + " PCAEmbeddingNode(n_components=3,\n", + " params={'base_name': 'Base_2_2',\n", + " 'field_name': 'nodes',\n", + " 'n_components': 3}))])),\n", + " ('regressor',\n", + " DatasetTargetTransformerRegressor(regressor=GPRegressorNo...\n", + " 'output': {'vector_names': ['reduced_mach']},\n", + " 'type': 'GaussianProcessRegressor'}),\n", + " transformer=Pipeline(steps=[('output_scalar_scaler',\n", + " ScalarScalerNode(params={'scalar_names': ['Q',\n", + " 'power',\n", + " 'Pr',\n", + " 'Tr',\n", + " 'eth_is',\n", + " 'angle_out'],\n", + " 'type': 'MinMaxScaler'})),\n", + " ('pca_mach',\n", + " PCAEmbeddingNode(n_components=5,\n", + " params={'base_name': 'Base_2_2',\n", + " 'field_name': 'mach',\n", + " 'n_components': 5}))])))])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline = Pipeline(\n", + " steps=[\n", + " (\"preprocessor\", preprocessor),\n", + " (\"regressor\", regressor),\n", + " ]\n", + ")\n", + "pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocessor',\n",
+       "                 Pipeline(steps=[('input_scalar_scaler',\n",
+       "                                  ScalarScalerNode(params={'scalar_names': ['angle_in',\n",
+       "                                                                            'mach_out'],\n",
+       "                                                           'type': 'MinMaxScaler'})),\n",
+       "                                 ('pca_nodes',\n",
+       "                                  PCAEmbeddingNode(n_components=3,\n",
+       "                                                   params={'base_name': 'Base_2_2',\n",
+       "                                                           'field_name': 'nodes',\n",
+       "                                                           'n_components': 3}))])),\n",
+       "                ('regressor',\n",
+       "                 DatasetTargetTransformerRegressor(regressor=GPRegressorNo...\n",
+       "                                                                                     'output': {'vector_names': ['reduced_mach']},\n",
+       "                                                                                     'type': 'GaussianProcessRegressor'}),\n",
+       "                                                   transformer=Pipeline(steps=[('output_scalar_scaler',\n",
+       "                                                                                ScalarScalerNode(params={'scalar_names': ['Q',\n",
+       "                                                                                                                          'power',\n",
+       "                                                                                                                          'Pr',\n",
+       "                                                                                                                          'Tr',\n",
+       "                                                                                                                          'eth_is',\n",
+       "                                                                                                                          'angle_out'],\n",
+       "                                                                                                         'type': 'MinMaxScaler'})),\n",
+       "                                                                               ('pca_mach',\n",
+       "                                                                                PCAEmbeddingNode(n_components=5,\n",
+       "                                                                                                 params={'base_name': 'Base_2_2',\n",
+       "                                                                                                         'field_name': 'mach',\n",
+       "                                                                                                         'n_components': 5}))])))])
" + ], + "text/plain": [ + "Pipeline(steps=[('preprocessor',\n", + " Pipeline(steps=[('input_scalar_scaler',\n", + " ScalarScalerNode(params={'scalar_names': ['angle_in',\n", + " 'mach_out'],\n", + " 'type': 'MinMaxScaler'})),\n", + " ('pca_nodes',\n", + " PCAEmbeddingNode(n_components=3,\n", + " params={'base_name': 'Base_2_2',\n", + " 'field_name': 'nodes',\n", + " 'n_components': 3}))])),\n", + " ('regressor',\n", + " DatasetTargetTransformerRegressor(regressor=GPRegressorNo...\n", + " 'output': {'vector_names': ['reduced_mach']},\n", + " 'type': 'GaussianProcessRegressor'}),\n", + " transformer=Pipeline(steps=[('output_scalar_scaler',\n", + " ScalarScalerNode(params={'scalar_names': ['Q',\n", + " 'power',\n", + " 'Pr',\n", + " 'Tr',\n", + " 'eth_is',\n", + " 'angle_out'],\n", + " 'type': 'MinMaxScaler'})),\n", + " ('pca_mach',\n", + " PCAEmbeddingNode(n_components=5,\n", + " params={'base_name': 'Base_2_2',\n", + " 'field_name': 'mach',\n", + " 'n_components': 5}))])))])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(dataset_train, dataset_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# pipeline.get_params()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3\n", + "5\n" + ] + } + ], + "source": [ + "print(pipeline.get_params()['preprocessor__pca_nodes__n_components'])\n", + "print(pipeline.get_params()['regressor__transformer__pca_mach__n_components'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 3 folds for each of 4 candidates, totalling 12 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-07-16 18:19:45,338:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,338:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,339:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,339:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,340:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,340:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,340:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,341:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,341:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,341:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,342:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:45,342:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 
18:19:45,342:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n",
+       "[... repeated 'field node with name mach already exists -> data will be replaced' warnings omitted ...]\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "[CV 1/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=4;, score=nan total time= 0.9s\n",
+       "[CV 2/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=4;, score=nan total time= 1.0s\n",
+       "[CV 3/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=4;, score=nan total time= 0.9s\n",
+       "[CV 1/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=5;, score=nan total time= 0.9s\n",
+       "[CV 2/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=5;, score=nan total time= 1.1s\n",
+       "[CV 3/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=5;, score=nan total time= 0.9s\n",
+       "[CV 1/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=4;, score=nan total time= 0.9s\n",
+       "[CV 2/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=4;, score=nan total time= 0.9s\n",
+       "[CV 3/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=4;, score=nan total time= 0.8s\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "[2025-07-16 
18:19:53,608:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,608:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,608:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,608:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,609:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,609:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,609:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,610:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,610:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,610:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,611:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,611:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,611:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,611:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,612:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,612:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,612:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,613:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,613:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,613:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,614:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:53,614:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV 1/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=5;, score=nan total time= 0.9s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-07-16 18:19:54,486:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,486:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,487:WARNING:sample.py:add_field(1685)]:field node with name mach already 
exists -> data will be replaced\n", + "[2025-07-16 18:19:54,487:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,487:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,488:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,488:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,488:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,489:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,489:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,489:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,490:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,490:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,490:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,491:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,491:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,491:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,491:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,492:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,492:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,492:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,493:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,493:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,493:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,494:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,494:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,494:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,494:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,495:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 
18:19:54,495:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,495:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,496:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:54,496:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV 2/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=5;, score=nan total time= 1.0s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2025-07-16 18:19:55,422:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,423:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,423:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,424:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,424:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,424:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,425:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,425:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,425:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,426:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,426:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,427:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,427:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,427:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,428:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,428:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,428:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,429:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,429:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,429:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,430:WARNING:sample.py:add_field(1685)]:field node with name mach already 
exists -> data will be replaced\n", + "[2025-07-16 18:19:55,430:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,430:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,431:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,431:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,431:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,432:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,432:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,432:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,433:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,433:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,433:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n", + "[2025-07-16 18:19:55,434:WARNING:sample.py:add_field(1685)]:field node with name mach already exists -> data will be replaced\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV 3/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=5;, score=nan total time= 0.8s\n" + ] + }, + { + "data": { + "text/html": [ + "
GridSearchCV(cv=3,\n",
+       "             estimator=Pipeline(steps=[('preprocessor',\n",
+       "                                        Pipeline(steps=[('input_scalar_scaler',\n",
+       "                                                         ScalarScalerNode(params={'scalar_names': ['angle_in',\n",
+       "                                                                                                   'mach_out'],\n",
+       "                                                                                  'type': 'MinMaxScaler'})),\n",
+       "                                                        ('pca_nodes',\n",
+       "                                                         PCAEmbeddingNode(n_components=3,\n",
+       "                                                                          params={'base_name': 'Base_2_2',\n",
+       "                                                                                  'field_name': 'nodes',\n",
+       "                                                                                  'n_components': 3}))])),\n",
+       "                                       ('regressor',\n",
+       "                                        DatasetTargetTransformerRegre...\n",
+       "                                                                                                       ScalarScalerNode(params={'scalar_names': ['Q',\n",
+       "                                                                                                                                                 'power',\n",
+       "                                                                                                                                                 'Pr',\n",
+       "                                                                                                                                                 'Tr',\n",
+       "                                                                                                                                                 'eth_is',\n",
+       "                                                                                                                                                 'angle_out'],\n",
+       "                                                                                                                                'type': 'MinMaxScaler'})),\n",
+       "                                                                                                      ('pca_mach',\n",
+       "                                                                                                       PCAEmbeddingNode(n_components=5,\n",
+       "                                                                                                                        params={'base_name': 'Base_2_2',\n",
+       "                                                                                                                                'field_name': 'mach',\n",
+       "                                                                                                                                'n_components': 5}))])))]),\n",
+       "             param_grid={'preprocessor__pca_nodes__n_components': [2, 3],\n",
+       "                         'regressor__transformer__pca_mach__n_components': [4,\n",
+       "                                                                            5]},\n",
+       "             verbose=3)