[Enhance] Combined dataset supports custom sampling ratio (#2562)

Ben-Louis · web-flow · commit 4233a611beba · 2023-07-21T16:48:58.000+08:00
diff --git a/docs/en/user_guides/mixed_datasets.md b/docs/en/user_guides/mixed_datasets.md
@@ -84,6 +84,11 @@ dataset = dict(
     # The pipeline includes typical transforms, such as loading the
     # image and data augmentation
     pipeline=train_pipeline,
+    # The sample_ratio_factor controls the sampling ratio of
+    # each dataset in the combined dataset. The length of sample_ratio_factor
+    # should match the number of datasets. Each factor indicates the sampling
+    # ratio of the corresponding dataset relative to its original length.
+    sample_ratio_factor=[1.0, 0.5]
 )
 ```
 
diff --git a/docs/zh_cn/user_guides/mixed_datasets.md b/docs/zh_cn/user_guides/mixed_datasets.md
@@ -84,6 +84,9 @@ dataset = dict(
     # `train_pipeline` 包含了常用的数据预处理，
     # 比如图片读取、数据增广等
     pipeline=train_pipeline,
+    # sample_ratio_factor 参数是用来调节每个子数据集
+    # 在组合数据集中的样本数量比例的
+    sample_ratio_factor=[1.0, 0.5]
 )
 ```
 
diff --git a/mmpose/datasets/dataset_wrappers.py b/mmpose/datasets/dataset_wrappers.py
@@ -1,8 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 from copy import deepcopy
-from typing import Any, Callable, List, Tuple, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
 
+import numpy as np
 from mmengine.dataset import BaseDataset
 from mmengine.registry import build_from_cfg
 
@@ -18,21 +19,37 @@ class CombinedDataset(BaseDataset):
         metainfo (dict): The meta information of combined dataset.
         datasets (list): The configs of datasets to be combined.
         pipeline (list, optional): Processing pipeline. Defaults to [].
+        sample_ratio_factor (list, optional): A list of sampling ratio
+            factors for each dataset. Defaults to None.
     """
 
     def __init__(self,
                  metainfo: dict,
                  datasets: list,
                  pipeline: List[Union[dict, Callable]] = [],
+                 sample_ratio_factor: Optional[List[float]] = None,
                  **kwargs):
 
         self.datasets = []
+        self.resample = sample_ratio_factor is not None
 
         for cfg in datasets:
             dataset = build_from_cfg(cfg, DATASETS)
             self.datasets.append(dataset)
 
         self._lens = [len(dataset) for dataset in self.datasets]
+        if self.resample:
+            assert len(sample_ratio_factor) == len(datasets), f'the length ' \
+                f'of `sample_ratio_factor` {len(sample_ratio_factor)} does ' \
+                f'not match the length of `datasets` {len(datasets)}'
+            assert min(sample_ratio_factor) >= 0.0, 'the ratio values in ' \
+                '`sample_ratio_factor` should not be negative.'
+            self._lens_ori = self._lens
+            self._lens = [
+                round(l * sample_ratio_factor[i])
+                for i, l in enumerate(self._lens_ori)
+            ]
+
         self._len = sum(self._lens)
 
         super(CombinedDataset, self).__init__(pipeline=pipeline, **kwargs)
@@ -71,6 +88,12 @@ def _get_subset_index(self, index: int) -> Tuple[int, int]:
         while index >= self._lens[subset_index]:
             index -= self._lens[subset_index]
             subset_index += 1
+
+        if self.resample:
+            gap = (self._lens_ori[subset_index] -
+                   1e-4) / self._lens[subset_index]
+            index = round(gap * index + np.random.rand() * gap - 0.5)
+
         return subset_index, index
 
     def prepare_data(self, idx: int) -> Any:
diff --git a/tests/test_datasets/test_datasets/test_dataset_wrappers/test_combined_dataset.py b/tests/test_datasets/test_datasets/test_dataset_wrappers/test_combined_dataset.py
@@ -81,6 +81,29 @@ def test_get_subset_index(self):
         self.assertEqual(subset_idx, 0)
         self.assertEqual(sample_idx, lens[0] - 1)
 
+        # combined dataset with resampling ratio
+        dataset = self.build_combined_dataset(sample_ratio_factor=[1, 0.3])
+        self.assertEqual(
+            len(dataset),
+            len(dataset.datasets[0]) + round(0.3 * len(dataset.datasets[1])))
+        lens = dataset._lens
+
+        index = lens[0]
+        subset_idx, sample_idx = dataset._get_subset_index(index)
+        self.assertEqual(subset_idx, 1)
+        self.assertIn(sample_idx, (0, 1, 2))
+
+        index = -lens[1] - 1
+        subset_idx, sample_idx = dataset._get_subset_index(index)
+        self.assertEqual(subset_idx, 0)
+        self.assertEqual(sample_idx, lens[0] - 1)
+
+        with self.assertRaises(AssertionError):
+            _ = self.build_combined_dataset(sample_ratio_factor=[1, 0.3, 0.1])
+
+        with self.assertRaises(AssertionError):
+            _ = self.build_combined_dataset(sample_ratio_factor=[1, -0.3])
+
     def test_prepare_data(self):
         dataset = self.build_combined_dataset()
         lens = dataset._lens

Original file line number	Diff line number	Diff line change
`@@ -84,6 +84,9 @@ dataset = dict(`
`84`	`84`	# `train_pipeline` 包含了常用的数据预处理，
`85`	`85`	`# 比如图片读取、数据增广等`
`86`	`86`	`pipeline=train_pipeline,`
	`87`	`+ # sample_ratio_factor 参数是用来调节每个子数据集`
	`88`	`+ # 在组合数据集中的样本数量比例的`
	`89`	`+ sample_ratio_factor=[1.0, 0.5]`
`87`	`90`	`)`
`88`	`91`	```
`89`	`92`