Skip to content

Commit

Permalink
fixes for indexing
Browse files Browse the repository at this point in the history
 - stop parsing the index as datetime for time series, so that the index is read the same way for every data type;
 - try to detect the index column even when it lies outside "columns_to_use".
  • Loading branch information
MorrisNein committed Dec 26, 2022
1 parent cf882d1 commit 8406523
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 22 deletions.
43 changes: 24 additions & 19 deletions fedot/core/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,7 @@ def from_csv_time_series(cls,
if isinstance(task, str):
task = Task(TaskTypesEnum(task))

df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_drop=columns_to_drop,
parse_index_as_datetime=True)
df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_drop=columns_to_drop)
idx = df.index.to_numpy()

if target_column is not None:
Expand Down Expand Up @@ -185,8 +184,7 @@ def from_csv_multi_time_series(cls,
An instance of :class:`InputData`.
"""

df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use,
parse_index_as_datetime=True)
df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use)
idx = df.index.to_numpy()
if columns_to_use is not None:
actual_df = df[columns_to_use]
Expand Down Expand Up @@ -605,11 +603,20 @@ def autodetect_data_type(task: Task) -> DataTypesEnum:


def get_df_from_csv(file_path: PathType, delimiter: str, index_col: Optional[Union[str, int]] = None,
possible_idx_keywords: Optional[List[str]] = None,
*, columns_to_drop: Optional[List[Union[str, int]]] = None,
columns_to_use: Optional[List[Union[str, int]]] = None, parse_index_as_datetime: bool = False):
columns_to_drop = columns_to_drop or []
columns_to_use = columns_to_use or []
possible_idx_keywords: Optional[List[str]] = None, *,
columns_to_drop: Optional[List[Union[str, int]]] = None,
columns_to_use: Optional[List[Union[str, int]]] = None):

def define_index_column(candidate_columns: List[str]) -> Optional[str]:
for column_name in candidate_columns:
if is_column_name_suitable_for_index(column_name):
return column_name

def is_column_name_suitable_for_index(column_name: str) -> bool:
return any(key in column_name.lower() for key in possible_idx_keywords)

columns_to_drop = copy(columns_to_drop) or []
columns_to_use = copy(columns_to_use) or []
possible_idx_keywords = possible_idx_keywords or []

logger = default_log('CSV data extraction')
Expand All @@ -623,16 +630,14 @@ def get_df_from_csv(file_path: PathType, delimiter: str, index_col: Optional[Uni
if columns_to_drop:
columns_to_use = [col for col in columns if col not in columns_to_drop]
elif not columns_to_use:
columns_to_use = columns
columns_to_use = list(columns)

if index_col is None:
first_column = columns_to_use[0]
if any(key in first_column.lower() for key in possible_idx_keywords):
logger.message(f'Used the column as index: "{first_column}".')
index_col = first_column
candidate_idx_cols = [columns_to_use[0], columns[0]]
if ((index_col is None) and
(index_col := define_index_column(candidate_idx_cols)) is not None):
logger.message(f'Used the column as index: "{index_col}".')

df = pd.read_csv(file_path, sep=delimiter, index_col=index_col, usecols=columns_to_use)
if (index_col is not None) and (index_col not in columns_to_use):
columns_to_use.append(index_col)

if parse_index_as_datetime and index_col:
df.index = pd.to_datetime(df.index).astype(str)
return df
return pd.read_csv(file_path, sep=delimiter, index_col=index_col, usecols=columns_to_use)
3 changes: 1 addition & 2 deletions fedot/core/data/multi_modal.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,7 @@ def from_csv_time_series(cls,
if isinstance(task, str):
task = Task(TaskTypesEnum(task))

df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use,
parse_index_as_datetime=True)
df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use)
idx = df.index.to_numpy()
if not columns_to_use:
columns_to_use = list(set(df.columns) - set(index_col))
Expand Down
2 changes: 1 addition & 1 deletion test/data/remote/remote_config_ts_multivar
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ pipeline_template = { "total_pipeline_operations": [ "data_source_ts/velocity
train_data = {fedot_base_path}/test/data/multivar_ts.csv
task = Task(TaskTypesEnum.ts_forecasting, TsForecastingParams(forecast_length=1))
output_path = ./test_ts_multivar
train_data_idx = ["2019-05-01 00:00:00","2019-05-02 00:00:00","2019-05-03 00:00:00","2019-05-04 00:00:00","2019-05-05 00:00:00","2019-05-06 00:00:00","2019-05-07 00:00:00","2019-05-08 00:00:00","2019-05-09 00:00:00","2019-05-10 00:00:00","2019-05-11 00:00:00","2019-05-12 00:00:00","2019-05-13 00:00:00","2019-05-14 00:00:00","2019-05-15 00:00:00"]
train_data_idx = ["2019-05-01","2019-05-02","2019-05-03","2019-05-04","2019-05-05","2019-05-06","2019-05-07","2019-05-08","2019-05-09","2019-05-10","2019-05-11","2019-05-12","2019-05-13","2019-05-14","2019-05-15"]
var_names = ["diesel_fuel_kWh","wind_power_kWh","diesel_time_h","wind_time_h","velocity_max_msec","velocity_mean_msec","tmp_grad"]
is_multi_modal = "True"
target = "diesel_fuel_kWh"
Expand Down

0 comments on commit 8406523

Please sign in to comment.