Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion docs/en/advanced_guides/codecs.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,23 @@ def encode(self,
return encoded
```

The encoded data is converted to Tensor format in `PackPoseInputs` and packed in `data_sample.gt_instance_labels` for model calls, which is generally used for loss calculation, as demonstrated by `loss()` in `RegressionHead`.
The encoded data is converted to Tensor format in `PackPoseInputs` and packed in `data_sample.gt_instance_labels` for model calls. By default, it contains the following encoded fields:

- `keypoint_labels`
- `keypoint_weights`
- `keypoints_visible_weights`

To specify data fields to be packed, you can define the `label_mapping_table` attribute in the codec. For example, in `VideoPoseLifting`:

```Python
label_mapping_table = dict(
trajectory_weights='trajectory_weights',
lifting_target_label='lifting_target_label',
lifting_target_weight='lifting_target_weight',
)
```

`data_sample.gt_instance_labels` are generally used for loss calculation, as demonstrated by `loss()` in `RegressionHead`.

```Python
def loss(self,
Expand All @@ -88,6 +104,10 @@ def loss(self,
### Omitted ###
```

```{note}
The encoder also defines the data to be packed into `data_sample.gt_instances` and `data_sample.gt_fields`. Modifying `instance_mapping_table` and `field_mapping_table` in the codec specifies the values to be packed, respectively. For the default values, please check [BaseKeypointCodec](https://github.com/open-mmlab/mmpose/blob/main/mmpose/codecs/base.py).
```

### Decoder

The decoder transforms the model outputs into coordinates in the input image space, which is the opposite processing of the encoder.
Expand Down
22 changes: 21 additions & 1 deletion docs/zh_cn/advanced_guides/codecs.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,23 @@ def encode(self,
return encoded
```

编码后的数据会在 `PackPoseInputs` 中被转换为 Tensor 格式,并封装到 `data_sample.gt_instance_labels` 中供模型调用,一般主要用于 loss 计算,下面以 `RegressionHead` 中的 `loss()` 为例:
编码后的数据会在 `PackPoseInputs` 中被转换为 Tensor 格式,并封装到 `data_sample.gt_instance_labels` 中供模型调用,默认包含以下的字段:

- `keypoint_labels`
- `keypoint_weights`
- `keypoints_visible_weights`

如要指定要打包的数据字段,可以在编解码器中定义 `label_mapping_table` 属性。例如,在 `VideoPoseLifting` 中:

```Python
label_mapping_table = dict(
trajectory_weights='trajectory_weights',
lifting_target_label='lifting_target_label',
lifting_target_weight='lifting_target_weight',
)
```

`data_sample.gt_instance_labels` 一般主要用于 loss 计算,下面以 `RegressionHead` 中的 `loss()` 为例:

```Python
def loss(self,
Expand All @@ -88,6 +104,10 @@ def loss(self,
### 后续内容省略 ###
```

```{note}
编码器亦会定义封装在 `data_sample.gt_instances` 和 `data_sample.gt_fields` 中的字段。修改编解码器中的 `instance_mapping_table` 和 `field_mapping_table` 的值将分别指定封装的字段,其中默认值定义在 [BaseKeypointCodec](https://github.com/open-mmlab/mmpose/blob/main/mmpose/codecs/base.py) 中。
```

### 解码器

解码器主要负责将模型的输出解码为输入图片尺度的坐标值,处理过程与编码器相反。
Expand Down
4 changes: 4 additions & 0 deletions mmpose/codecs/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ class BaseKeypointCodec(metaclass=ABCMeta):
# mandatory `keypoints` and `keypoints_visible` arguments.
auxiliary_encode_keys = set()

field_mapping_table = dict()
instance_mapping_table = dict()
label_mapping_table = dict()

@abstractmethod
def encode(self,
keypoints: np.ndarray,
Expand Down
9 changes: 9 additions & 0 deletions mmpose/codecs/image_pose_lifting.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@ class ImagePoseLifting(BaseKeypointCodec):

auxiliary_encode_keys = {'lifting_target', 'lifting_target_visible'}

instance_mapping_table = dict(
lifting_target='lifting_target',
lifting_target_visible='lifting_target_visible',
)
label_mapping_table = dict(
trajectory_weights='trajectory_weights',
lifting_target_label='lifting_target_label',
lifting_target_weight='lifting_target_weight')

def __init__(self,
num_keypoints: int,
root_index: int,
Expand Down
9 changes: 9 additions & 0 deletions mmpose/codecs/motionbert_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ class MotionBERTLabel(BaseKeypointCodec):
'lifting_target', 'lifting_target_visible', 'camera_param', 'factor'
}

instance_mapping_table = dict(
lifting_target='lifting_target',
lifting_target_visible='lifting_target_visible',
)
label_mapping_table = dict(
trajectory_weights='trajectory_weights',
lifting_target_label='lifting_target_label',
lifting_target_weight='lifting_target_weight')

def __init__(self,
num_keypoints: int,
root_index: int = 0,
Expand Down
9 changes: 9 additions & 0 deletions mmpose/codecs/video_pose_lifting.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ class VideoPoseLifting(BaseKeypointCodec):
'lifting_target', 'lifting_target_visible', 'camera_param'
}

instance_mapping_table = dict(
lifting_target='lifting_target',
lifting_target_visible='lifting_target_visible',
)
label_mapping_table = dict(
trajectory_weights='trajectory_weights',
lifting_target_label='lifting_target_label',
lifting_target_weight='lifting_target_weight')

def __init__(self,
num_keypoints: int,
zero_center: bool = True,
Expand Down
30 changes: 25 additions & 5 deletions mmpose/datasets/transforms/common_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -974,18 +974,38 @@ def transform(self, results: Dict) -> Optional[dict]:
keypoints_visible=keypoints_visible,
**auxiliary_encode_kwargs)

if self.encoder.field_mapping_table:
encoded[
'field_mapping_table'] = self.encoder.field_mapping_table
if self.encoder.instance_mapping_table:
encoded['instance_mapping_table'] = \
self.encoder.instance_mapping_table
if self.encoder.label_mapping_table:
encoded[
'label_mapping_table'] = self.encoder.label_mapping_table

else:
encoded_list = []
for _encoder in self.encoder:
auxiliary_encode_kwargs = {
key: results[key]
for key in _encoder.auxiliary_encode_keys
}
encoded_list.append(
_encoder.encode(
keypoints=keypoints,
keypoints_visible=keypoints_visible,
**auxiliary_encode_kwargs))
encoded = _encoder.encode(
keypoints=keypoints,
keypoints_visible=keypoints_visible,
**auxiliary_encode_kwargs)

if _encoder.field_mapping_table:
encoded['field_mapping_table'] = \
_encoder.field_mapping_table
if _encoder.instance_mapping_table:
encoded['instance_mapping_table'] = \
_encoder.instance_mapping_table
if _encoder.label_mapping_table:
encoded['label_mapping_table'] = \
_encoder.label_mapping_table
encoded_list.append(encoded)

if self.multilevel:
# For multilevel encoding, the encoded items from each encoder
Expand Down
76 changes: 29 additions & 47 deletions mmpose/datasets/transforms/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,44 +104,30 @@ class PackPoseInputs(BaseTransform):

# items in `instance_mapping_table` will be directly packed into
# PoseDataSample.gt_instances without converting to Tensor
instance_mapping_table = {
'bbox': 'bboxes',
'head_size': 'head_size',
'bbox_center': 'bbox_centers',
'bbox_scale': 'bbox_scales',
'bbox_score': 'bbox_scores',
'keypoints': 'keypoints',
'keypoints_visible': 'keypoints_visible',
'lifting_target': 'lifting_target',
'lifting_target_visible': 'lifting_target_visible',
}

# items in `label_mapping_table` will be packed into
# PoseDataSample.gt_instance_labels and converted to Tensor. These items
# will be used for computing losses
label_mapping_table = {
'keypoint_labels': 'keypoint_labels',
'lifting_target_label': 'lifting_target_label',
'lifting_target_weight': 'lifting_target_weight',
'trajectory_weights': 'trajectory_weights',
'keypoint_x_labels': 'keypoint_x_labels',
'keypoint_y_labels': 'keypoint_y_labels',
'keypoint_weights': 'keypoint_weights',
'instance_coords': 'instance_coords',
'keypoints_visible_weights': 'keypoints_visible_weights'
}
instance_mapping_table = dict(
bbox='bboxes',
bbox_score='bbox_scores',
keypoints='keypoints',
keypoints_visible='keypoints_visible')

# items in `field_mapping_table` will be packed into
# PoseDataSample.gt_fields and converted to Tensor. These items will be
# used for computing losses
field_mapping_table = {
'heatmaps': 'heatmaps',
'instance_heatmaps': 'instance_heatmaps',
'heatmap_mask': 'heatmap_mask',
'heatmap_weights': 'heatmap_weights',
'displacements': 'displacements',
'displacement_weights': 'displacement_weights',
}
field_mapping_table = dict(
heatmaps='heatmaps',
instance_heatmaps='instance_heatmaps',
heatmap_mask='heatmap_mask',
heatmap_weights='heatmap_weights',
displacements='displacements',
displacement_weights='displacement_weights')

# items in `label_mapping_table` will be packed into
# PoseDataSample.gt_instance_labels and converted to Tensor. These items
# will be used for computing losses
label_mapping_table = dict(
keypoint_labels='keypoint_labels',
keypoint_weights='keypoint_weights',
keypoints_visible_weights='keypoints_visible_weights')

def __init__(self,
meta_keys=('id', 'img_id', 'img_path', 'category_id',
Expand Down Expand Up @@ -182,12 +168,10 @@ def transform(self, results: dict) -> dict:

# pack instance data
gt_instances = InstanceData()
for key, packed_key in self.instance_mapping_table.items():
_instance_mapping_table = results.get('instance_mapping_table',
self.instance_mapping_table)
for key, packed_key in _instance_mapping_table.items():
if key in results:
if 'lifting_target' in results and key in {
'keypoints', 'keypoints_visible'
}:
continue
gt_instances.set_field(results[key], packed_key)

# pack `transformed_keypoints` for visualizing data transform
Expand All @@ -200,14 +184,10 @@ def transform(self, results: dict) -> dict:

# pack instance labels
gt_instance_labels = InstanceData()
for key, packed_key in self.label_mapping_table.items():
_label_mapping_table = results.get('label_mapping_table',
self.label_mapping_table)
for key, packed_key in _label_mapping_table.items():
if key in results:
# For pose-lifting, store only target-related fields
if 'lifting_target' in results and packed_key in {
'keypoint_labels', 'keypoint_weights',
'keypoints_visible'
}:
continue
if isinstance(results[key], list):
# A list of labels is usually generated by combined
# multiple encoders (See ``GenerateTarget`` in
Expand All @@ -222,7 +202,9 @@ def transform(self, results: dict) -> dict:

# pack fields
gt_fields = None
for key, packed_key in self.field_mapping_table.items():
_field_mapping_table = results.get('field_mapping_table',
self.field_mapping_table)
for key, packed_key in _field_mapping_table.items():
if key in results:
if isinstance(results[key], list):
if gt_fields is None:
Expand Down
6 changes: 3 additions & 3 deletions mmpose/datasets/transforms/pose3d_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,9 @@ def transform(self, results: Dict) -> dict:
lifting_target, lifting_target_visible, flip_indices,
**self.target_flip_cfg)

results[keypoints_key] = keypoints
results[keypoints_visible_key] = keypoints_visible
results[target_key] = lifting_target
results[keypoints_key] = keypoints
results[keypoints_visible_key] = keypoints_visible
results[target_key] = lifting_target
results['lifting_target_visible'] = lifting_target_visible

# flip horizontal distortion coefficients
Expand Down