update

zytx121 · zytx121 · commit 8ddb245f8a6e · 2022-12-29T10:21:01.000+08:00
diff --git a/configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py b/configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py
@@ -145,27 +145,29 @@
         file_client_args=dict(backend='disk')),
     dict(
         type='TestTimeAug',
-        transforms=[[
-            dict(type='RandomFlip', prob=1.),
-            dict(type='RandomFlip', prob=0.)
-        ],
-                    [
-                        dict(
-                            type='RandomCenterCropPad',
-                            ratios=None,
-                            border=None,
-                            mean=[0, 0, 0],
-                            std=[1, 1, 1],
-                            to_rgb=True,
-                            test_mode=True,
-                            test_pad_mode=['logical_or', 31],
-                            test_pad_add_pix=1),
-                    ],
-                    [
-                        dict(
-                            type='PackDetInputs',
-                            meta_keys=('img_id', 'img_path', 'ori_shape',
-                                       'img_shape', 'flip', 'flip_direction',
-                                       'border'))
-                    ]])
+        transforms=[
+            [
+                # ``RandomFlip`` must be placed before ``RandomCenterCropPad``
+                dict(type='RandomFlip', prob=1.),
+                dict(type='RandomFlip', prob=0.)
+            ],
+            [
+                dict(
+                    type='RandomCenterCropPad',
+                    ratios=None,
+                    border=None,
+                    mean=[0, 0, 0],
+                    std=[1, 1, 1],
+                    to_rgb=True,
+                    test_mode=True,
+                    test_pad_mode=['logical_or', 31],
+                    test_pad_add_pix=1),
+            ],
+            [
+                dict(
+                    type='PackDetInputs',
+                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                               'flip', 'flip_direction', 'border'))
+            ]
+        ])
 ]
diff --git a/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py b/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py
@@ -191,25 +191,28 @@
     dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
     dict(
         type='TestTimeAug',
-        transforms=[[
-            dict(type='Resize', scale=(640, 640), keep_ratio=True),
-            dict(type='Resize', scale=(672, 672), keep_ratio=True),
-            dict(type='Resize', scale=(608, 608), keep_ratio=True),
-        ], [
-            dict(type='RandomFlip', prob=1.),
-            dict(type='RandomFlip', prob=0.)
-        ],
-                    [
-                        dict(
-                            type='Pad',
-                            size=(640, 640),
-                            pad_val=dict(img=(114, 114, 114))),
-                    ], [dict(type='LoadAnnotations', with_bbox=True)],
-                    [
-                        dict(
-                            type='PackDetInputs',
-                            meta_keys=('img_id', 'img_path', 'ori_shape',
-                                       'img_shape', 'scale_factor', 'flip',
-                                       'flip_direction'))
-                    ]])
+        transforms=[
+            [
+                dict(type='Resize', scale=(640, 640), keep_ratio=True),
+                dict(type='Resize', scale=(672, 672), keep_ratio=True),
+                dict(type='Resize', scale=(608, 608), keep_ratio=True),
+            ],
+            [
+                # ``RandomFlip`` must be placed before ``Pad``
+                dict(type='RandomFlip', prob=1.),
+                dict(type='RandomFlip', prob=0.)
+            ],
+            [
+                dict(
+                    type='Pad',
+                    size=(640, 640),
+                    pad_val=dict(img=(114, 114, 114))),
+            ],
+            [
+                dict(
+                    type='PackDetInputs',
+                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                               'scale_factor', 'flip', 'flip_direction'))
+            ]
+        ])
 ]
diff --git a/configs/yolox/yolox_s_8xb8-300e_coco.py b/configs/yolox/yolox_s_8xb8-300e_coco.py
@@ -244,25 +244,28 @@
     dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
     dict(
         type='TestTimeAug',
-        transforms=[[
-            dict(type='Resize', scale=(416, 416), keep_ratio=True),
-            dict(type='Resize', scale=(384, 384), keep_ratio=True),
-            dict(type='Resize', scale=(448, 448), keep_ratio=True),
-        ], [
-            dict(type='RandomFlip', prob=1.),
-            dict(type='RandomFlip', prob=0.)
-        ],
-                    [
-                        dict(
-                            type='Pad',
-                            pad_to_square=True,
-                            pad_val=dict(img=(114.0, 114.0, 114.0))),
-                    ],
-                    [
-                        dict(
-                            type='PackDetInputs',
-                            meta_keys=('img_id', 'img_path', 'ori_shape',
-                                       'img_shape', 'scale_factor', 'flip',
-                                       'flip_direction'))
-                    ]])
+        transforms=[
+            [
+                dict(type='Resize', scale=(416, 416), keep_ratio=True),
+                dict(type='Resize', scale=(384, 384), keep_ratio=True),
+                dict(type='Resize', scale=(448, 448), keep_ratio=True),
+            ],
+            [
+                # ``RandomFlip`` must be placed before ``Pad``
+                dict(type='RandomFlip', prob=1.),
+                dict(type='RandomFlip', prob=0.)
+            ],
+            [
+                dict(
+                    type='Pad',
+                    pad_to_square=True,
+                    pad_val=dict(img=(114.0, 114.0, 114.0))),
+            ],
+            [
+                dict(
+                    type='PackDetInputs',
+                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                               'scale_factor', 'flip', 'flip_direction'))
+            ]
+        ])
 ]
diff --git a/mmdet/models/test_time_augs/det_tta.py b/mmdet/models/test_time_augs/det_tta.py
@@ -1,11 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List
+from typing import List, Tuple
 
 import torch
 from mmcv.ops import batched_nms
 from mmengine.model import BaseTTAModel
 from mmengine.registry import MODELS
 from mmengine.structures import InstanceData
+from torch import Tensor
 
 from mmdet.structures import DetDataSample
 from mmdet.structures.bbox import bbox_flip
@@ -14,13 +15,44 @@
 @MODELS.register_module()
 class DetTTAModel(BaseTTAModel):
     """Merge augmented detection results, only bboxes corresponding score under
-    flipping and multi-scale resizing can be processed now."""
+    flipping and multi-scale resizing can be processed now.
+
+    Examples:
+        >>> tta_model = dict(
+        >>>     type='DetTTAModel',
+        >>>     tta_cfg=dict(nms=dict(
+        >>>                     type='nms',
+        >>>                     iou_threshold=0.5),
+        >>>                     max_per_img=100))
+        >>>
+        >>> tta_pipeline = [
+        >>>     dict(type='LoadImageFromFile',
+        >>>          file_client_args=dict(backend='disk')),
+        >>>     dict(
+        >>>         type='TestTimeAug',
+        >>>         transforms=[[
+        >>>             dict(type='Resize',
+        >>>                  scale=(1333, 800),
+        >>>                  keep_ratio=True),
+        >>>         ], [
+        >>>             dict(type='RandomFlip', prob=1.),
+        >>>             dict(type='RandomFlip', prob=0.)
+        >>>         ], [
+        >>>             dict(
+        >>>                 type='PackDetInputs',
+        >>>                 meta_keys=('img_id', 'img_path', 'ori_shape',
+        >>>                         'img_shape', 'scale_factor', 'flip',
+        >>>                         'flip_direction'))
+        >>>         ]])]
+    """
 
     def __init__(self, tta_cfg=None, **kwargs):
         super().__init__(**kwargs)
         self.tta_cfg = tta_cfg
 
-    def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas):
+    def merge_aug_bboxes(self, aug_bboxes: List[Tensor],
+                         aug_scores: List[Tensor],
+                         img_metas: List[str]) -> Tuple[Tensor, Tensor]:
         """Merge augmented detection bboxes and scores.
 
         Args:
@@ -50,20 +82,32 @@ def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas):
             return bboxes, scores
 
     def merge_preds(self, data_samples_list: List[List[DetDataSample]]):
-        """Merge predictions of enhanced data to one prediction.
+        """Merge batch predictions of enhanced data.
 
         Args:
-            data_samples_list (List[List[ClsDataSample]]): List of predictions
-                of all enhanced data.
+            data_samples_list (List[List[DetDataSample]]): List of predictions
+                of all enhanced data. The outer list indicates images, and the
+                inner list corresponds to the different views of one image.
+                Each element of the inner list is a ``DetDataSample``.
         Returns:
-            List[ClsDataSample]: Merged prediction.
+            List[DetDataSample]: Merged batch prediction.
         """
         merged_data_samples = []
         for data_samples in data_samples_list:
             merged_data_samples.append(self._merge_single_sample(data_samples))
         return merged_data_samples
 
-    def _merge_single_sample(self, data_samples):
+    def _merge_single_sample(
+            self, data_samples: List[DetDataSample]) -> DetDataSample:
+        """Merge predictions which come from the different views of one image
+        to one prediction.
+
+        Args:
+            data_samples (List[DetDataSample]): List of predictions
+                of enhanced data which come from one image.
+        Returns:
+            DetDataSample: Merged prediction.
+        """
         aug_bboxes = []
         aug_scores = []
         aug_labels = []