From 72e9557bbc3eab57c80fbadb62d2831f25c5eb08 Mon Sep 17 00:00:00 2001 From: Matt Buchovecky Date: Fri, 21 Jun 2024 14:21:46 -0700 Subject: [PATCH 1/3] GH-3474: add random seed parameter to dataset splitting and downsampling functions --- flair/data.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/flair/data.py b/flair/data.py index 1ab3b29f4..1ceb19319 100644 --- a/flair/data.py +++ b/flair/data.py @@ -1301,6 +1301,7 @@ def __init__( test: Optional[Dataset[T_co]] = None, name: str = "corpus", sample_missing_splits: Union[bool, str] = True, + random_seed: Optional[int] = None, ) -> None: # set name self.name: str = name @@ -1314,7 +1315,7 @@ def __init__( test_portion = 0.1 train_length = _len_dataset(train) test_size: int = round(train_length * test_portion) - test, train = randomly_split_into_two_datasets(train, test_size) + test, train = randomly_split_into_two_datasets(train, test_size, random_seed) log.warning( "No test split found. Using %.0f%% (i.e. %d samples) of the train split as test data", test_portion, @@ -1326,7 +1327,7 @@ def __init__( dev_portion = 0.1 train_length = _len_dataset(train) dev_size: int = round(train_length * dev_portion) - dev, train = randomly_split_into_two_datasets(train, dev_size) + dev, train = randomly_split_into_two_datasets(train, dev_size, random_seed) log.warning( "No dev split found. Using %.0f%% (i.e. %d samples) of the train split as dev data", dev_portion, @@ -1353,18 +1354,20 @@ def test(self) -> Optional[Dataset[T_co]]: def downsample( self, percentage: float = 0.1, - downsample_train=True, - downsample_dev=True, - downsample_test=True, + random_seed: Optional[int] = None, + downsample_train: bool = True, + downsample_dev: bool = True, + downsample_test: bool = True, ): + """Reduce all datasets in corpus proportionally to the given percentage.""" if downsample_train and self._train is not None: - self._train = self._downsample_to_proportion(self._train, percentage) + self._train = self._downsample_to_proportion(self._train, percentage, random_seed) if downsample_dev and self._dev is not None: - self._dev = self._downsample_to_proportion(self._dev, percentage) + self._dev = self._downsample_to_proportion(self._dev, percentage, random_seed) if downsample_test and self._test is not None: - self._test = self._downsample_to_proportion(self._test, percentage) + self._test = self._downsample_to_proportion(self._test, percentage, random_seed) return self @@ -1461,9 +1464,9 @@ def _get_all_tokens(self) -> List[str]: return [t.text for t in tokens] @staticmethod - def _downsample_to_proportion(dataset: Dataset, proportion: float): + def _downsample_to_proportion(dataset: Dataset, proportion: float, random_seed: Optional[int] = None) -> Subset: sampled_size: int = round(_len_dataset(dataset) * proportion) - splits = randomly_split_into_two_datasets(dataset, sampled_size) + splits = randomly_split_into_two_datasets(dataset, sampled_size, random_seed=random_seed) return splits[0] def obtain_statistics(self, label_type: Optional[str] = None, pretty_print: bool = True) -> Union[dict, str]: @@ -1879,11 +1882,21 @@ def iob2(tags): return True -def randomly_split_into_two_datasets(dataset, length_of_first): +def randomly_split_into_two_datasets( + dataset: Dataset, length_of_first: int, random_seed: Optional[int] = None +) -> tuple[Subset, Subset]: + """Shuffles a dataset and splits into two subsets. + + The length of the first is specified and the remaining samples go into the second subset. + """ import random indices = list(range(len(dataset))) - random.shuffle(indices) + if random_seed is None: + random.shuffle(indices) + else: + random_generator = random.Random(random_seed) + random_generator.shuffle(indices) first_dataset = indices[:length_of_first] second_dataset = indices[length_of_first:] From b969f03549b81901c3d7f31b0c2385ccada690fe Mon Sep 17 00:00:00 2001 From: Matt Buchovecky Date: Fri, 28 Jun 2024 10:23:47 -0700 Subject: [PATCH 2/3] fix: type hint for compatibility with Python 3.8, and keep new parameter at end of parameters --- flair/data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flair/data.py b/flair/data.py index 1ceb19319..3163757f6 100644 --- a/flair/data.py +++ b/flair/data.py @@ -6,7 +6,7 @@ from collections import Counter, defaultdict from operator import itemgetter from pathlib import Path -from typing import Dict, Iterable, List, NamedTuple, Optional, Union, cast +from typing import Dict, Iterable, List, NamedTuple, Optional, Tuple, Union, cast import torch from deprecated.sphinx import deprecated @@ -1354,10 +1354,10 @@ def test(self) -> Optional[Dataset[T_co]]: def downsample( self, percentage: float = 0.1, - random_seed: Optional[int] = None, downsample_train: bool = True, downsample_dev: bool = True, downsample_test: bool = True, + random_seed: Optional[int] = None, ): """Reduce all datasets in corpus proportionally to the given percentage.""" if downsample_train and self._train is not None: @@ -1884,7 +1884,7 @@ def iob2(tags): def randomly_split_into_two_datasets( dataset: Dataset, length_of_first: int, random_seed: Optional[int] = None -) -> tuple[Subset, Subset]: +) -> Tuple[Subset, Subset]: """Shuffles a dataset and splits into two subsets. The length of the first is specified and the remaining samples go into the second subset. From 7ebd20e9287760ab0c6cef3bac346c5a8b254c89 Mon Sep 17 00:00:00 2001 From: Matt Buchovecky Date: Mon, 1 Jul 2024 11:03:57 -0700 Subject: [PATCH 3/3] Change dataset len function to satisfy MyPy Co-authored-by: Alan Akbik --- flair/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/data.py b/flair/data.py index 3163757f6..8a13ae8bd 100644 --- a/flair/data.py +++ b/flair/data.py @@ -1891,7 +1891,7 @@ def randomly_split_into_two_datasets( """ import random - indices = list(range(len(dataset))) + indices = list(range(_len_dataset(dataset))) if random_seed is None: random.shuffle(indices) else: