fixes for indexing

- Prevent parsing the index as datetime for time series, for consistency.
- Try to detect the index column even when it is outside "columns_to_use".
aimclub · Dec 26, 2022 · 8406523 · 8406523
1 parent cf882d1
commit 8406523
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 22 deletions.
diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
@@ -124,8 +124,7 @@ def from_csv_time_series(cls,
         if isinstance(task, str):
             task = Task(TaskTypesEnum(task))
 
-        df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_drop=columns_to_drop,
-                             parse_index_as_datetime=True)
+        df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_drop=columns_to_drop)
         idx = df.index.to_numpy()
 
         if target_column is not None:
@@ -185,8 +184,7 @@ def from_csv_multi_time_series(cls,
             An instance of :class:`InputData`.
         """
 
-        df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use,
-                             parse_index_as_datetime=True)
+        df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use)
         idx = df.index.to_numpy()
         if columns_to_use is not None:
             actual_df = df[columns_to_use]
@@ -605,11 +603,20 @@ def autodetect_data_type(task: Task) -> DataTypesEnum:
 
 
 def get_df_from_csv(file_path: PathType, delimiter: str, index_col: Optional[Union[str, int]] = None,
-                    possible_idx_keywords: Optional[List[str]] = None,
-                    *, columns_to_drop: Optional[List[Union[str, int]]] = None,
-                    columns_to_use: Optional[List[Union[str, int]]] = None, parse_index_as_datetime: bool = False):
-    columns_to_drop = columns_to_drop or []
-    columns_to_use = columns_to_use or []
+                    possible_idx_keywords: Optional[List[str]] = None, *,
+                    columns_to_drop: Optional[List[Union[str, int]]] = None,
+                    columns_to_use: Optional[List[Union[str, int]]] = None):
+
+    def define_index_column(candidate_columns: List[str]) -> Optional[str]:
+        for column_name in candidate_columns:
+            if is_column_name_suitable_for_index(column_name):
+                return column_name
+
+    def is_column_name_suitable_for_index(column_name: str) -> bool:
+        return any(key in column_name.lower() for key in possible_idx_keywords)
+
+    columns_to_drop = copy(columns_to_drop) or []
+    columns_to_use = copy(columns_to_use) or []
     possible_idx_keywords = possible_idx_keywords or []
 
     logger = default_log('CSV data extraction')
@@ -623,16 +630,14 @@ def get_df_from_csv(file_path: PathType, delimiter: str, index_col: Optional[Uni
     if columns_to_drop:
         columns_to_use = [col for col in columns if col not in columns_to_drop]
     elif not columns_to_use:
-        columns_to_use = columns
+        columns_to_use = list(columns)
 
-    if index_col is None:
-        first_column = columns_to_use[0]
-        if any(key in first_column.lower() for key in possible_idx_keywords):
-            logger.message(f'Used the column as index: "{first_column}".')
-            index_col = first_column
+    candidate_idx_cols = [columns_to_use[0], columns[0]]
+    if ((index_col is None) and
+            (index_col := define_index_column(candidate_idx_cols)) is not None):
+        logger.message(f'Used the column as index: "{index_col}".')
 
-    df = pd.read_csv(file_path, sep=delimiter, index_col=index_col, usecols=columns_to_use)
+    if (index_col is not None) and (index_col not in columns_to_use):
+        columns_to_use.append(index_col)
 
-    if parse_index_as_datetime and index_col:
-        df.index = pd.to_datetime(df.index).astype(str)
-    return df
+    return pd.read_csv(file_path, sep=delimiter, index_col=index_col, usecols=columns_to_use)
diff --git a/fedot/core/data/multi_modal.py b/fedot/core/data/multi_modal.py
@@ -194,8 +194,7 @@ def from_csv_time_series(cls,
         if isinstance(task, str):
             task = Task(TaskTypesEnum(task))
 
-        df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use,
-                             parse_index_as_datetime=True)
+        df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use)
         idx = df.index.to_numpy()
         if not columns_to_use:
             columns_to_use = list(set(df.columns) - set(index_col))

diff --git a/test/data/remote/remote_config_ts_multivar b/test/data/remote/remote_config_ts_multivar
@@ -3,7 +3,7 @@ pipeline_template =   {	"total_pipeline_operations": [		"data_source_ts/velocity
 train_data = {fedot_base_path}/test/data/multivar_ts.csv
 task = Task(TaskTypesEnum.ts_forecasting, TsForecastingParams(forecast_length=1))
 output_path = ./test_ts_multivar
-train_data_idx = ["2019-05-01 00:00:00","2019-05-02 00:00:00","2019-05-03 00:00:00","2019-05-04 00:00:00","2019-05-05 00:00:00","2019-05-06 00:00:00","2019-05-07 00:00:00","2019-05-08 00:00:00","2019-05-09 00:00:00","2019-05-10 00:00:00","2019-05-11 00:00:00","2019-05-12 00:00:00","2019-05-13 00:00:00","2019-05-14 00:00:00","2019-05-15 00:00:00"]
+train_data_idx = ["2019-05-01","2019-05-02","2019-05-03","2019-05-04","2019-05-05","2019-05-06","2019-05-07","2019-05-08","2019-05-09","2019-05-10","2019-05-11","2019-05-12","2019-05-13","2019-05-14","2019-05-15"]
 var_names = ["diesel_fuel_kWh","wind_power_kWh","diesel_time_h","wind_time_h","velocity_max_msec","velocity_mean_msec","tmp_grad"]
 is_multi_modal = "True"
 target = "diesel_fuel_kWh"