-
Notifications
You must be signed in to change notification settings - Fork 86
/
verification_rules.py
368 lines (286 loc) · 16.2 KB
/
verification_rules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
from typing import Optional
from fedot.core.operations.atomized_model import AtomizedModel
from fedot.core.operations.model import Model
from fedot.core.pipelines.node import PipelineNode
from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.operation_types_repository import OperationTypesRepository, get_operations_for_task, \
atomized_model_type
from fedot.core.repository.tasks import Task, TaskTypesEnum
ERROR_PREFIX = 'Invalid pipeline configuration:'
def has_correct_operations_for_task(pipeline: Pipeline, task_type: Optional[TaskTypesEnum] = None):
    """ Check that the root operation of the pipeline is applicable to the given task type """
    if task_type:
        allowed_tasks = pipeline.root_node.operation.acceptable_task_types
        if task_type not in allowed_tasks:
            raise ValueError(f'{ERROR_PREFIX} Pipeline has incorrect operations positions')
    return True
def has_primary_nodes(pipeline: Pipeline):
    """ Check that the pipeline contains at least one primary node """
    primary_exists = any(isinstance(candidate, PipelineNode) and candidate.is_primary
                         for candidate in pipeline.nodes)
    if not primary_exists:
        raise ValueError(f'{ERROR_PREFIX} Pipeline does not have primary nodes')
    return True
def has_final_operation_as_model(pipeline: Pipeline):
    """ Check if the operation in root node is model or not """
    root = pipeline.root_node
    if root.operation.operation_type == atomized_model_type():
        # An atomized model wraps an inner pipeline: validate its root recursively
        has_final_operation_as_model(root.operation.pipeline)
    elif not isinstance(root.operation, Model):
        raise ValueError(f'{ERROR_PREFIX} Root operation is not a model')
    return True
def has_no_conflicts_with_data_flow(pipeline: Pipeline):
    """ Check if the pipeline contains incorrect connections between nodes.

    A node must not be fed by several parents that all perform the same
    data operation when that operation is marked as unsuitable for combining.

    :param pipeline: pipeline for checking
    :raises ValueError: if identical forbidden data operations are combined
    """
    operation_repo = OperationTypesRepository(operation_type='data_operation')
    forbidden_parents_combination = set(operation_repo.suitable_operation())
    for node in pipeline.nodes:
        parent_nodes = node.nodes_from
        if parent_nodes is not None and len(parent_nodes) > 1:
            # Distinct operation names among all parents of this node
            operation_names = {parent.operation.operation_type for parent in parent_nodes}
            # All parents perform the same operation and combining it is forbidden
            if len(operation_names) == 1 and operation_names.pop() in forbidden_parents_combination:
                raise ValueError(f'{ERROR_PREFIX} Pipeline has incorrect subgraph with identical data operations')
    return True
def has_correct_data_connections(pipeline: Pipeline):
    """ Check if the pipeline contains incorrect connections between operation for different data types """
    for node in pipeline.nodes:
        # Atomized models carry a nested pipeline that must be validated as well
        if isinstance(node.operation, AtomizedModel):
            has_correct_data_connections(node.operation.pipeline)
        # Custom nodes accept anything; primary nodes have no parents to check
        if node.operation.metadata.id == 'custom' or node.is_primary:
            continue
        # Intersect this node's input types with every non-custom parent's outputs
        compatible_types = set(node.operation.metadata.input_types)
        for parent in node.nodes_from:
            if parent.operation.metadata.id != 'custom':
                compatible_types &= set(parent.operation.metadata.output_types)
        if not compatible_types:
            raise ValueError(f'{ERROR_PREFIX} Pipeline has incorrect subgraph with wrong parent nodes combination')
    return True
def has_no_data_flow_conflicts_in_ts_pipeline(pipeline: Pipeline):
    """ Function checks the correctness of connection between nodes """
    task = Task(TaskTypesEnum.ts_forecasting)
    ts_models = get_operations_for_task(task=task, mode='model', tags=["non_lagged"])
    non_ts_models = sorted(list(set(get_operations_for_task(task=task, mode='model')).difference(set(ts_models))))
    # Preprocessing not only for time series
    non_ts_data_operations = get_operations_for_task(task=task, mode='data_operation',
                                                     tags=["non_applicable_for_ts"])
    ts_data_operations = get_operations_for_task(task=task, mode='data_operation', tags=["non_lagged"])
    # Lagged and sparse lagged transformations are treated as a separate group
    ts_to_table_operations = ['lagged', 'sparse_lagged', 'exog_ts']
    for transform_name in ts_to_table_operations:
        ts_data_operations.remove(transform_name)
    # Dictionary as {'current operation in the node': 'parent operations list'}
    wrong_connections = get_wrong_links(ts_to_table_operations, ts_data_operations, non_ts_data_operations,
                                        ts_models, non_ts_models)
    limit_parents_count, need_to_have_parent = get_parent_limits(ts_to_table_operations, ts_data_operations,
                                                                 non_ts_data_operations, ts_models)
    for node in pipeline.nodes:
        invalid = False
        if node.is_primary:
            # A primary node must consume raw time series data
            # and must not be an operation that requires a parent
            invalid = (DataTypesEnum.ts not in node.operation.metadata.input_types
                       or node.operation.operation_type in need_to_have_parent)
        else:
            current_operation = node.operation.operation_type
            max_parents = limit_parents_count.get(current_operation)
            if max_parents is not None:
                # Too many parents for an operation with a limited parent count
                invalid = max_parents < len(node.nodes_from)
            if not invalid:
                forbidden_parents = wrong_connections.get(current_operation)
                if forbidden_parents:
                    actual_parents = {parent.operation.operation_type for parent in node.nodes_from}
                    invalid = bool(set(forbidden_parents) & actual_parents)
        if invalid:
            raise ValueError(f'{ERROR_PREFIX} Pipeline has incorrect subgraph with wrong parent nodes combination')
    return True
def has_correct_location_of_resample(pipeline: Pipeline):
    """
    Pipeline can have only one resample operation located in start of the pipeline

    :param pipeline: pipeline for checking
    """
    resample_in_primary = False
    other_in_primary = False
    for node in pipeline.nodes:
        node_is_resample = node.name == 'resample'
        if node.is_primary:
            if node_is_resample:
                resample_in_primary = True
            else:
                other_in_primary = True
        elif node_is_resample:
            # resample anywhere but the start of the pipeline is forbidden
            raise ValueError(
                f'{ERROR_PREFIX} Pipeline can have only one resample operation located in start of the pipeline')
    # resample must not be mixed with other primary nodes
    if resample_in_primary and other_in_primary:
        raise ValueError(
            f'{ERROR_PREFIX} Pipeline can have only one resample operation located in start of the pipeline')
    return True
def get_wrong_links(ts_to_table_operations: list, ts_data_operations: list, non_ts_data_operations: list,
                    ts_models: list, non_ts_models: list) -> dict:
    """
    Function that return wrong ts connections like op_A : [op_B, op_C] that means op_B and op_C
    can't be a parent for op_A.

    :param ts_to_table_operations: list of ts_to_table operations
    :param ts_data_operations: list of ts data operations
    :param non_ts_data_operations: list of non ts data operations
    :param ts_models: list of ts models
    :param non_ts_models: list of non ts models
    :return: dict with wrong connections
    """
    # Operations that produce table data cannot parent raw-ts consumers
    table_producing_parents = ts_models + non_ts_models + non_ts_data_operations + ts_to_table_operations
    wrong_connections = {}
    # Table consumers must not be fed by raw time series data operations
    for table_consumer in non_ts_data_operations + non_ts_models:
        wrong_connections[table_consumer] = ts_data_operations
    # Raw-ts consumers must not be fed by table-producing operations
    for ts_consumer in ts_data_operations + ts_models + ts_to_table_operations:
        wrong_connections[ts_consumer] = table_producing_parents
    return wrong_connections
def get_parent_limits(ts_to_table_operations: list, ts_data_operations: list, non_ts_data_operations: list,
                      ts_models: list) -> (dict, list):
    """
    Function that return some constraints on number of parents for time series forecasting graphs

    :param ts_to_table_operations: list of ts_to_table operations
    :param ts_data_operations: list of ts data operations
    :param non_ts_data_operations: list of non ts data operations
    :param ts_models: list of ts models
    :return: dict with parent limits and list with operations that must have a parent
    """
    # ts models, ts data operations and ts-to-table transforms accept one parent;
    # 'decompose' is limited to a single parent as well
    limit_parents_count = dict.fromkeys(ts_models + ts_data_operations + ts_to_table_operations, 1)
    limit_parents_count['decompose'] = 1
    # Non-ts data operations cannot be primary, so each of them must have a parent
    need_to_have_parent = list(non_ts_data_operations)
    return limit_parents_count, need_to_have_parent
def only_non_lagged_operations_are_primary(pipeline: Pipeline):
    """ Only time series specific operations could be placed in primary nodes """
    for node in pipeline.nodes:
        # Only primary pipeline nodes are inspected
        if not (isinstance(node, PipelineNode) and node.is_primary):
            continue
        if DataTypesEnum.ts not in node.operation.metadata.input_types:
            raise ValueError(
                f'{ERROR_PREFIX} Pipeline for forecasting has not non_lagged preprocessing in primary nodes')
    return True
def has_no_conflicts_in_decompose(pipeline: Pipeline):
    """ The function checks whether the 'class_decompose' or 'decompose' operation has two ancestors """
    for decomposer in ('decompose', 'class_decompose'):
        decompose_nodes = pipeline.get_nodes_by_name(decomposer)
        if decompose_nodes:
            # Validate parent count and parent ordering for every decompose node found
            __check_decomposer_has_two_parents(nodes_to_check=decompose_nodes)
            __check_decompose_parent_position(nodes_to_check=decompose_nodes)
    return True
def has_correct_data_sources(pipeline: Pipeline):
    """ Checks that data sources and other nodes are not mixed """
    primary_is_data_source = []
    for node in pipeline.nodes:
        if isinstance(node, PipelineNode) and node.is_primary:
            primary_is_data_source.append('data_source' in str(node))
    # Either every primary node is a data source or none of them is
    if any(primary_is_data_source) and not all(primary_is_data_source):
        raise ValueError(f'{ERROR_PREFIX} Data sources are mixed with other primary nodes')
    return True
def has_parent_contain_single_resample(pipeline: Pipeline):
    """ 'Resample' should be single parent node for child operation. """
    for node in pipeline.nodes:
        if node.operation.operation_type != 'resample':
            continue
        # Every child of a resample node must have resample as its only parent
        for child in pipeline.node_children(node):
            if len(child.nodes_from) > 1:
                raise ValueError(f'{ERROR_PREFIX} Resample node is not single parent node for child operation')
    return True
def has_no_conflicts_during_multitask(pipeline: Pipeline):
    """
    Now if the classification task is solved, one part of the pipeline can solve
    the regression task if used after class_decompose. If class_decompose is followed
    by a classification operation, then this pipelining is incorrect.
    Validation perform only for classification pipelines.
    """
    classification_operations = get_operations_for_task(task=Task(TaskTypesEnum.classification), mode='all')
    pipeline_operations = {node.operation.operation_type for node in pipeline.nodes}
    classification_in_pipeline = set(classification_operations).intersection(pipeline_operations)
    if not classification_in_pipeline:
        # Not a classification pipeline - nothing to validate
        return True
    if 'class_decompose' not in pipeline_operations:
        # There are no decompose operations in the pipeline
        if len(pipeline_operations) != len(classification_in_pipeline):
            # There are operations in the pipeline that solve different tasks
            __check_multitask_operation_location(pipeline, classification_operations)
    return True
def has_no_conflicts_after_class_decompose(pipeline: Pipeline):
    """
    After the class_decompose operation, a regression model is required.
    Validation perform only for classification pipelines.
    """
    error_message = f'{ERROR_PREFIX} After classification decompose it is required to use regression model'
    all_operations = {node.operation.operation_type for node in pipeline.nodes}
    if 'class_decompose' not in all_operations:
        return True
    regression_operations = get_operations_for_task(task=Task(TaskTypesEnum.regression), mode='all')
    # Every direct descendant of class_decompose must solve the regression task
    for node in pipeline.nodes:
        parent_operations = [parent.operation.operation_type for parent in node.nodes_from]
        if 'class_decompose' in parent_operations and node.operation.operation_type not in regression_operations:
            raise ValueError(error_message)
    return True
def __check_decompose_parent_position(nodes_to_check: list):
    """ Function check if the data flow before decompose operation is correct
    or not

    :param nodes_to_check: list with decompose nodes in the pipeline
    """
    for decompose_node in nodes_to_check:
        # The first parent supplies the prediction to decompose, so it must be a model
        first_parent = decompose_node.nodes_from[0]
        if not isinstance(first_parent.operation, Model):
            raise ValueError(f'{ERROR_PREFIX} For decompose operation Model as first parent is required')
def __check_decomposer_has_two_parents(nodes_to_check: list):
    """ Function check if there are two parent nodes for decompose operation

    :param nodes_to_check: list with decompose nodes in the pipeline
    """
    for decompose_node in nodes_to_check:
        parents = decompose_node.nodes_from
        if parents is None:
            raise ValueError('Decompose operation has no parents')
        if len(parents) != 2:
            raise ValueError(f'{ERROR_PREFIX} Two parents for decompose node were'
                             f' expected, but {len(parents)} were given')
def __check_multitask_operation_location(pipeline: Pipeline, operations_for_classification: list):
    """
    Investigate paths for different tasks in the pipeline. If the pipeline solves
    several tasks simultaneously and there are no transitive operations in its
    structure (e.g. class_decompose), then the side branches must start from the
    primary node (nodes)
    """
    # TODO refactor to implement check via PipelineStructureExplorer
    primary_operations = {node.operation.operation_type
                          for node in pipeline.nodes
                          if isinstance(node, PipelineNode) and node.is_primary}
    classification_primaries = set(operations_for_classification).intersection(primary_operations)
    if len(primary_operations) != len(classification_primaries):
        # Side branches for other tasks start from their own primary nodes - OK
        return True
    raise ValueError(f'{ERROR_PREFIX} Current pipeline can not solve multitask problem')