From 72e9557bbc3eab57c80fbadb62d2831f25c5eb08 Mon Sep 17 00:00:00 2001
From: Matt Buchovecky <mbuchove@gmail.com>
Date: Fri, 21 Jun 2024 14:21:46 -0700
Subject: [PATCH 1/3] GH-3474: add random seed parameter to dataset splitting
 and downsampling functions

---
 flair/data.py | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/flair/data.py b/flair/data.py
index 1ab3b29f4..1ceb19319 100644
--- a/flair/data.py
+++ b/flair/data.py
@@ -1301,6 +1301,7 @@ def __init__(
         test: Optional[Dataset[T_co]] = None,
         name: str = "corpus",
         sample_missing_splits: Union[bool, str] = True,
+        random_seed: Optional[int] = None,
     ) -> None:
         # set name
         self.name: str = name
@@ -1314,7 +1315,7 @@ def __init__(
             test_portion = 0.1
             train_length = _len_dataset(train)
             test_size: int = round(train_length * test_portion)
-            test, train = randomly_split_into_two_datasets(train, test_size)
+            test, train = randomly_split_into_two_datasets(train, test_size, random_seed)
             log.warning(
                 "No test split found. Using %.0f%% (i.e. %d samples) of the train split as test data",
                 test_portion,
@@ -1326,7 +1327,7 @@ def __init__(
             dev_portion = 0.1
             train_length = _len_dataset(train)
             dev_size: int = round(train_length * dev_portion)
-            dev, train = randomly_split_into_two_datasets(train, dev_size)
+            dev, train = randomly_split_into_two_datasets(train, dev_size, random_seed)
             log.warning(
                 "No dev split found. Using %.0f%% (i.e. %d samples) of the train split as dev data",
                 dev_portion,
@@ -1353,18 +1354,20 @@ def test(self) -> Optional[Dataset[T_co]]:
     def downsample(
         self,
         percentage: float = 0.1,
-        downsample_train=True,
-        downsample_dev=True,
-        downsample_test=True,
+        random_seed: Optional[int] = None,
+        downsample_train: bool = True,
+        downsample_dev: bool = True,
+        downsample_test: bool = True,
     ):
+        """Reduce all datasets in corpus proportionally to the given percentage."""
         if downsample_train and self._train is not None:
-            self._train = self._downsample_to_proportion(self._train, percentage)
+            self._train = self._downsample_to_proportion(self._train, percentage, random_seed)
 
         if downsample_dev and self._dev is not None:
-            self._dev = self._downsample_to_proportion(self._dev, percentage)
+            self._dev = self._downsample_to_proportion(self._dev, percentage, random_seed)
 
         if downsample_test and self._test is not None:
-            self._test = self._downsample_to_proportion(self._test, percentage)
+            self._test = self._downsample_to_proportion(self._test, percentage, random_seed)
 
         return self
 
@@ -1461,9 +1464,9 @@ def _get_all_tokens(self) -> List[str]:
         return [t.text for t in tokens]
 
     @staticmethod
-    def _downsample_to_proportion(dataset: Dataset, proportion: float):
+    def _downsample_to_proportion(dataset: Dataset, proportion: float, random_seed: Optional[int] = None) -> Subset:
         sampled_size: int = round(_len_dataset(dataset) * proportion)
-        splits = randomly_split_into_two_datasets(dataset, sampled_size)
+        splits = randomly_split_into_two_datasets(dataset, sampled_size, random_seed=random_seed)
         return splits[0]
 
     def obtain_statistics(self, label_type: Optional[str] = None, pretty_print: bool = True) -> Union[dict, str]:
@@ -1879,11 +1882,21 @@ def iob2(tags):
     return True
 
 
-def randomly_split_into_two_datasets(dataset, length_of_first):
+def randomly_split_into_two_datasets(
+    dataset: Dataset, length_of_first: int, random_seed: Optional[int] = None
+) -> tuple[Subset, Subset]:
+    """Shuffles a dataset and splits into two subsets.
+
+    The length of the first is specified and the remaining samples go into the second subset.
+    """
     import random
 
     indices = list(range(len(dataset)))
-    random.shuffle(indices)
+    if random_seed is None:
+        random.shuffle(indices)
+    else:
+        random_generator = random.Random(random_seed)
+        random_generator.shuffle(indices)
 
     first_dataset = indices[:length_of_first]
     second_dataset = indices[length_of_first:]

From b969f03549b81901c3d7f31b0c2385ccada690fe Mon Sep 17 00:00:00 2001
From: Matt Buchovecky <mbuchove@gmail.com>
Date: Fri, 28 Jun 2024 10:23:47 -0700
Subject: [PATCH 2/3] fix: use typing.Tuple return hint for Python 3.8
 compatibility, and move random_seed to the end of downsample's parameters

---
 flair/data.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flair/data.py b/flair/data.py
index 1ceb19319..3163757f6 100644
--- a/flair/data.py
+++ b/flair/data.py
@@ -6,7 +6,7 @@
 from collections import Counter, defaultdict
 from operator import itemgetter
 from pathlib import Path
-from typing import Dict, Iterable, List, NamedTuple, Optional, Union, cast
+from typing import Dict, Iterable, List, NamedTuple, Optional, Tuple, Union, cast
 
 import torch
 from deprecated.sphinx import deprecated
@@ -1354,10 +1354,10 @@ def test(self) -> Optional[Dataset[T_co]]:
     def downsample(
         self,
         percentage: float = 0.1,
-        random_seed: Optional[int] = None,
         downsample_train: bool = True,
         downsample_dev: bool = True,
         downsample_test: bool = True,
+        random_seed: Optional[int] = None,
     ):
         """Reduce all datasets in corpus proportionally to the given percentage."""
         if downsample_train and self._train is not None:
@@ -1884,7 +1884,7 @@ def iob2(tags):
 
 def randomly_split_into_two_datasets(
     dataset: Dataset, length_of_first: int, random_seed: Optional[int] = None
-) -> tuple[Subset, Subset]:
+) -> Tuple[Subset, Subset]:
     """Shuffles a dataset and splits into two subsets.
 
     The length of the first is specified and the remaining samples go into the second subset.

From 7ebd20e9287760ab0c6cef3bac346c5a8b254c89 Mon Sep 17 00:00:00 2001
From: Matt Buchovecky <mbuchove@gmail.com>
Date: Mon, 1 Jul 2024 11:03:57 -0700
Subject: [PATCH 3/3] Use _len_dataset instead of len() when splitting
 datasets, to satisfy mypy

Co-authored-by: Alan Akbik <alan.akbik@gmail.com>
---
 flair/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flair/data.py b/flair/data.py
index 3163757f6..8a13ae8bd 100644
--- a/flair/data.py
+++ b/flair/data.py
@@ -1891,7 +1891,7 @@ def randomly_split_into_two_datasets(
     """
     import random
 
-    indices = list(range(len(dataset)))
+    indices = list(range(_len_dataset(dataset)))
     if random_seed is None:
         random.shuffle(indices)
     else: