Merge pull request #102 from roboflow/feature/dataset_split

SkalskiP · web-flow · commit c02f69428e5d · 2023-05-17T20:22:30.000+02:00
feature/dataset_split
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -2,6 +2,7 @@
 
 - Added [[#100](https://github.com/roboflow/supervision/pull/100)]: support for Dataset inheritance. Current `Dataset` got renamed to `DetectionDataset` and make it inherit from `BaseDataset`.
 - Added [[#100](https://github.com/roboflow/supervision/pull/100)]: ability to save datasets in YOLO format using `DetectionDataset.as_yolo`.
+- Added [[#102](https://github.com/roboflow/supervision/pull/102)]: support for splitting `DetectionDataset`.
 - Changed [[#100](https://github.com/roboflow/supervision/pull/100)]: default value of `approximation_percentage` parameter from `0.75` to `0.0` in `DetectionDataset.as_yolo` and `DetectionDataset.as_pascal_voc`.
 
 ### 0.7.0 <small>May 11, 2023</small>
diff --git a/supervision/dataset/core.py b/supervision/dataset/core.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Tuple
@@ -16,14 +17,22 @@
     save_data_yaml,
     save_yolo_annotations,
 )
-from supervision.dataset.ultils import save_dataset_images
+from supervision.dataset.ultils import save_dataset_images, train_test_split
 from supervision.detection.core import Detections
 from supervision.file import list_files_with_extensions
 
 
 @dataclass
-class BaseDataset:
-    pass
+class BaseDataset(ABC):
+    @abstractmethod
+    def __len__(self) -> int:
+        pass
+
+    @abstractmethod
+    def split(
+        self, split_ratio=0.8, random_state=None, shuffle: bool = True
+    ) -> Tuple[BaseDataset, BaseDataset]:
+        pass
 
 
 @dataclass
@@ -61,6 +70,36 @@ def __iter__(self) -> Iterator[Tuple[str, np.ndarray, Detections]]:
         for image_name, image in self.images.items():
             yield image_name, image, self.annotations.get(image_name, None)
 
+    def split(
+        self, split_ratio=0.8, random_state=None, shuffle: bool = True
+    ) -> Tuple[DetectionDataset, DetectionDataset]:
+        """
+        Splits the dataset into two parts using the provided split_ratio.
+
+        Args:
+            split_ratio (float): Share of the images placed in the first part.
+            random_state (Optional[int]): Seed forwarded to train_test_split.
+            shuffle (bool): Whether to shuffle the images before splitting.
+
+        Returns:
+            Tuple[DetectionDataset, DetectionDataset]: The split datasets.
+        """
+
+        def subset(names: List[str]) -> DetectionDataset:
+            # Tolerate images without an annotation entry, as __iter__ does.
+            return DetectionDataset(
+                classes=self.classes,
+                images={n: self.images[n] for n in names},
+                annotations={
+                    n: self.annotations[n] for n in names if n in self.annotations
+                },
+            )
+
+        train_names, test_names = train_test_split(
+            list(self.images), split_ratio, random_state=random_state, shuffle=shuffle
+        )
+        return subset(train_names), subset(test_names)
+
     def as_pascal_voc(
         self,
         images_directory_path: Optional[str] = None,
diff --git a/supervision/dataset/ultils.py b/supervision/dataset/ultils.py
@@ -1,6 +1,7 @@
 import os
+import random
 from pathlib import Path
-from typing import Dict, List
+from typing import Dict, List, Optional, Tuple, TypeVar
 
 import cv2
 import numpy as np
@@ -11,6 +12,8 @@
     mask_to_polygons,
 )
 
+T = TypeVar("T")
+
 
 def approximate_mask_with_polygons(
     mask: np.ndarray,
@@ -48,3 +51,31 @@ def save_dataset_images(
     for image_name, image in images.items():
         target_image_path = os.path.join(images_directory_path, image_name)
         cv2.imwrite(target_image_path, image)
+
+
+def train_test_split(
+    data: List[T],
+    train_ratio: float = 0.8,
+    random_state: Optional[int] = None,
+    shuffle: bool = True,
+) -> Tuple[List[T], List[T]]:
+    """
+    Splits the data into two parts using the provided train_ratio.
+
+    The input list is left untouched and seeding never alters the global RNG.
+
+    Args:
+        data (List[T]): The data to split.
+        train_ratio (float): The ratio of the training set to the entire dataset.
+        random_state (Optional[int]): The seed for the random number generator.
+        shuffle (bool): Whether to shuffle the data before splitting.
+
+    Returns:
+        Tuple[List[T], List[T]]: The split data.
+    """
+    items = list(data)  # work on a copy so the caller's list keeps its order
+    if shuffle:
+        rng = random if random_state is None else random.Random(random_state)
+        rng.shuffle(items)
+    split_index = int(len(items) * train_ratio)
+    return items[:split_index], items[split_index:]
diff --git a/test/dataset/test_utils.py b/test/dataset/test_utils.py
@@ -0,0 +1,82 @@
+from typing import List, TypeVar, Optional, Tuple
+from contextlib import ExitStack as DoesNotRaise
+
+import pytest
+
+from supervision.dataset.ultils import train_test_split
+
+T = TypeVar("T")
+
+
+@pytest.mark.parametrize(
+    'data, train_ratio, random_state, shuffle, expected_result, exception',
+    [
+        (
+            [],
+            0.5,
+            None,
+            False,
+            ([], []),
+            DoesNotRaise()
+        ),  # empty data
+        (
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            0.5,
+            None,
+            False,
+            ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
+            DoesNotRaise()
+        ),  # data with 10 numbers and 50% train split
+        (
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            1.0,
+            None,
+            False,
+            ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], []),
+            DoesNotRaise()
+        ),  # data with 10 numbers and 100% train split
+        (
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            0.0,
+            None,
+            False,
+            ([], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+            DoesNotRaise()
+        ),  # data with 10 numbers and 0% train split
+        (
+            ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'],
+            0.5,
+            None,
+            False,
+            (['a', 'b', 'c', 'd', 'e'], ['f', 'g', 'h', 'i', 'j']),
+            DoesNotRaise()
+        ),  # data with 10 chars and 50% train split
+        (
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            0.5,
+            23,
+            True,
+            ([7, 8, 5, 6, 3], [2, 9, 0, 1, 4]),
+            DoesNotRaise()
+        ),  # data with 10 numbers and 50% train split with 23 random seed
+        (
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            0.5,
+            32,
+            True,
+            ([4, 6, 0, 8, 9], [5, 7, 2, 3, 1]),
+            DoesNotRaise()
+        ),  # data with 10 numbers and 50% train split with 32 random seed
+    ]
+)
+def test_train_test_split(
+    data: List[T],
+    train_ratio: float,
+    random_state: int,
+    shuffle: bool,
+    expected_result: Optional[Tuple[List[T], List[T]]],
+    exception: Exception
+) -> None:
+    with exception:
+        result = train_test_split(data=data, train_ratio=train_ratio, random_state=random_state, shuffle=shuffle)
+        assert result == expected_result